def calc_recognition_performance(self, HResults_scp, lexicon_file=None):
    """Run HTK ``HResults`` and return the loaded performance figures.

    Args:
        HResults_scp: script file passed to ``HResults`` via ``-S``.
        lexicon_file: file passed as the positional argument of
            ``HResults``. Defaults to ``self.lexicon_file`` when omitted.

    Returns:
        Whatever ``self._load_recognition_performance()`` returns
        (presumably parsed from the captured HResults output -- confirm
        against that method).

    Side effects:
        Stores the result of ``run_command`` in ``self.command``,
        ``self.output`` and ``self.error``.
    """
    # Identity check: None is a singleton, so ``is`` is the correct test.
    if lexicon_file is None:
        lexicon_file = self.lexicon_file

    self.command, self.output, self.error = run_command(
        ['HResults', '-T', '1', lexicon_file, '-S', HResults_scp])
    return self._load_recognition_performance()
def recognition(self, lattice_file, hmmdefs, HVite_scp, lexicon_file=None):
    """Run HTK ``HVite`` recognition and return the loaded result.

    Args:
        lattice_file: word network passed to ``HVite`` via ``-w``.
        hmmdefs: HMM definition file passed via ``-H``.
        HVite_scp: script file passed via ``-S``.
        lexicon_file: dictionary passed as a positional argument.
            Defaults to ``self.lexicon_file`` when omitted.

    Returns:
        Whatever ``self._load_recognition_result()`` returns.

    Side effects:
        Stores the result of ``run_command`` in ``self.command``,
        ``self.output`` and ``self.error``.
    """
    # Identity check: None is a singleton, so ``is`` is the correct test.
    if lexicon_file is None:
        lexicon_file = self.lexicon_file

    self.command, self.output, self.error = run_command([
        'HVite', '-T', '1',
        '-C', self.config_rec,
        '-w', lattice_file,
        '-H', hmmdefs,
        lexicon_file,
        self.phonelist_txt,
        '-S', HVite_scp
    ])
    return self._load_recognition_result()
def create_hmmdefs(self, proto, hmmdefs):
    """Write an hmmdefs file generated from a prototype.

    Runs the Perl script ``HTK.mkhmmdefs_pl`` with ``proto`` and
    ``self.phonelist_txt`` as arguments and writes the text it produces
    to ``hmmdefs`` as ASCII bytes. (Original author's note: allocates
    mean & variance to all entries of the phone list.)

    Args:
        proto: path of the prototype file handed to the script.
        hmmdefs: path of the file to write.
    """
    perl_cmd = ['perl', HTK.mkhmmdefs_pl, proto, self.phonelist_txt]
    _cmd, output, _err = run_command(perl_cmd)

    if os.name == 'nt':
        # Strip any existing CR first so the LF -> CRLF conversion cannot
        # produce doubled carriage returns.
        output = output.replace('\r', '').replace('\n', '\r\n')

    # Binary mode keeps the line endings exactly as prepared above.
    with open(hmmdefs, 'wb') as f:
        f.write(bytes(output, 'ascii'))
def _network2lattice(self, network_file, lattice_file):
    """Create a word-level lattice file from a word network with ``HParse``.

    The network file is a text file containing a syntax description made
    of rewrite rules based on extended Backus-Naur Form (EBNF).

    Args:
        network_file: word network.
        lattice_file: word level lattice file.

    Side effects:
        Stores the result of ``run_command`` in ``self.command``,
        ``self.output`` and ``self.error``.

    Reference:
        http://www1.icsi.berkeley.edu/Speech/docs/HTKBook/node247.html
    """
    hparse_cmd = ['HParse', network_file, lattice_file]
    self.command, self.output, self.error = run_command(hparse_cmd)
def flat_start(self, HCompV_scp, model_dir, proto=None):
    """Create a prototype and run HTK ``HCompV`` on it.

    Args:
        HCompV_scp: a script file (passed via ``-S``).
        model_dir: the directory passed via ``-M``.
        proto: path of the prototype file that ``self._create_proto`` is
            called with and that is then handed to ``HCompV``. Optional;
            when omitted, the historical hard-coded location is used so
            existing callers keep working.

    Side effects:
        Calls ``self._create_proto(proto)`` and stores the result of
        ``run_command`` in ``self.command``, ``self.output`` and
        ``self.error``.
    """
    if proto is None:
        # Legacy default: a machine-specific absolute path. Callers on
        # other machines should pass ``proto`` explicitly.
        proto = r'c:\OneDrive\Research\rug\experiments\acoustic_model\fame\htk\config\proto'
    self._create_proto(proto)

    self.command, self.output, self.error = run_command([
        'HCompV', '-T', '1',
        '-C', self.config_train,
        '-f', '0.01',
        '-m',
        '-S', HCompV_scp,
        '-M', model_dir,
        proto
    ])
def re_estimation(self, hmmdefs, output_dir, HCompV_scp, mlf_file=None, macros=None):
    """Run HTK ``HERest`` with the training configuration.

    Args:
        hmmdefs: HMM definition file passed via ``-H``.
        output_dir: directory passed via ``-M``.
        HCompV_scp: script file passed via ``-S``.
        mlf_file: optional master label file; when given it is passed
            via ``-I``.
        macros: optional macro file; when given it is passed via an
            additional ``-H``.

    Side effects:
        Stores the result of ``run_command`` in ``self.command``,
        ``self.output`` and ``self.error``.
    """
    command_list = [
        'HERest', '-T', '1',
        '-C', self.config_train,
        '-v', '0.01',
        '-t', '250.0', '150.0', '1000.0',
        '-H', hmmdefs,
        '-M', output_dir
    ]
    # Optional arguments are appended only when supplied; the argument
    # order of the original command line is preserved.
    if mlf_file is not None:
        command_list.extend(['-I', mlf_file])
    if macros is not None:
        command_list.extend(['-H', macros])
    command_list.extend(['-S', HCompV_scp, self.phonelist_txt])

    self.command, self.output, self.error = run_command(command_list)
def create_dictionary(self, sentence, log_txt, dictionary_file):
    """Build a dictionary file for ``sentence`` with HTK ``HDMan``.

    A temporary label file is produced by ``self.create_label_file`` and
    passed to ``HDMan`` via ``-w``; a second temporary file receives the
    ``-n`` output. Both temporary files are removed afterwards, even when
    label creation or the command raises.

    Note (from the original author): when the length of the filename
    exceeds 32 characters, error.

    Args:
        sentence: passed to ``self.create_label_file``.
        log_txt: log file passed via ``-l``.
        dictionary_file: output dictionary passed as a positional argument.

    Side effects:
        Stores the result of ``run_command`` in ``self.command``,
        ``self.output`` and ``self.error``.
    """
    # delete=False + immediate close: only the unique path is needed, the
    # files themselves are (re)opened by name by the callee / HDMan.
    label_file = NamedTemporaryFile(mode='w', delete=False, encoding='utf-8')
    label_file.close()
    phonelist_txt = NamedTemporaryFile(mode='w', delete=False, encoding='utf-8')
    phonelist_txt.close()

    try:
        self.create_label_file(sentence, label_file.name)
        self.command, self.output, self.error = run_command([
            'HDMan', '-w', label_file.name,
            '-g', self.global_ded,
            '-n', phonelist_txt.name,
            '-l', log_txt,
            dictionary_file,
            self.lexicon_file
        ])
    finally:
        # Always clean up so a failure does not leave temp files behind.
        os.remove(label_file.name)
        os.remove(phonelist_txt.name)
def increase_mixture(hmmdefs, nmix, output_dir, phonelist_txt):
    """Write a ``MU`` edit script for ``nmix`` and run HTK ``HHEd`` with it.

    Args:
        hmmdefs: HMM definition file passed via ``-H``.
        nmix: value written after ``MU`` in the edit script; also used in
            the edit script's file name (``mix<nmix>.hed``).
        output_dir: directory prepared with ``fh.make_new_directory``; it
            holds the edit script and is passed to ``HHEd`` via ``-M``.
        phonelist_txt: phone list passed as the last positional argument.
    """
    fh.make_new_directory(output_dir)

    hed_name = 'mix' + str(nmix) + '.hed'
    header_file = os.path.join(output_dir, hed_name)

    hed_text = ' '.join(['MU', str(nmix), '{*.state[2-4].mix}'])
    with open(header_file, 'wb') as f:
        f.write(bytes(hed_text, 'ascii'))

    hhed_cmd = [
        'HHEd', '-T', '1',
        '-H', hmmdefs,
        '-M', output_dir,
        header_file,
        phonelist_txt
    ]
    run_command(hhed_cmd)


#if __name__ == '__main__':

#def txt2label(file_txt, label_file):
#    """
#    Convert an orthographycal transcription to the HTK label.
#
#    :param path file_txt: path to the text file in which the orthographycal transcription of the utterance is written in one line.
#    :param path label_file: path to the text file in which the contents of file_txt is written as a word per line.
#    """
#    # read the first line where the sentence is written.
#    with open(file_txt, 'r') as f:
#        line1 = f.readline()
#    # remove space at the end and comma
#    line1 = line1.rstrip()
#    line1 = line1.replace(',', '')
#    # write each word in a capital letter
#    line1list = line1.split(' ')
#    with open(label_file, 'w') as f:
#        for word in line1list:
#            f.write("%s\n" % word.upper())

#def lab2HTKdic(label_file, fileHTKdic, lex, connect):
#    """
#    Make a HTK dictionary file from a HTK label file.
#    Each word in the label file is first searched in database.
#    If not found, the Grapheme-to-Phone(G2P) program is used.
#
#    :param path label_file: path to the text file in which the contents of file_txt is written as a word per line.
#    :param path fileHTKdic: path to the output dictionary file in which pronunciation variants will be written.
#    :param instance lex: an instance of class cLexicon.
#    :param pypyodbc connect: an object of pypyodbc.
# """ # cursor = connect.cursor() # tableList = ['2010_2510_lexicon_pronvars_HTK', 'lexicon_pronvars_g2p', 'lexicon_pronvars_ipa'] # with open(label_file, 'r') as f: # lines = f.read() # words = lines.split() # with open(fileHTKdic, 'w') as f: # for WORD in words: # word = WORD.lower() # rows = np.array([]) # for tableNum in range(0, len(tableList)): # SQL_string = """SELECT word, pronunciation FROM %s WHERE word =?""" %(tableList[tableNum]) # cursor.execute(SQL_string, (word,)) # rows_ = cursor.fetchall() # rows = np.append(rows, rows_) # # if the pronunciations are found in the database, retreive them. # # otherwise obtain the pronunciation using g2p. # if len(rows) > 2: # # reshape into d x 2 array # htkdic_ = rows.reshape((-1, 2)) # # remove duplicates # htkdic = np.unique(htkdic_, axis=0) # else: # # get pronunciation using g2p # fileWordList = tempfile.NamedTemporaryFile(mode='w', delete=False) # fileWordList.write("%s\n" % (word)) # fileWordList.close() # htkdic = lex.g2p2db(fileWordList.name, connect) # fileWordList.close() # os.remove(fileWordList.name) # # output htkdic # for line in htkdic: # word = line[0].upper() # pron = line[1] # f.write("{0}\t{1}\n".format(word, pron)) #def loadHTKdic(fileHTKdic): # """ # load dic file which is used for HTK. # :param path fileHTKdic: path to the output dictionary file in which pronunciation variants will be written. # """ # HTKdic = [] # with open(fileHTKdic, 'r') as f: # lines = f.read() # lines = lines.split('\n') # for line in lines: # line = line.split('\t') # if len(line) > 1: # HTKdic.append(line) # return np.array(HTKdic) #def HTKdic2list(fileHTKdic): # """ # load an HTK dictionary file as a list. # :param path fileHTKdic: the path to an HTK dictionary file in which [word] /t [pronunciation] is written each line. 
# """ # with open(fileHTKdic, 'r') as fin: # lines_ = fin.read() # # split all text into lines # lines = lines_.split('\n') # htkdic = [] # for line in lines: # lineSplit = line.split('\t') # if len(lineSplit) == 2: # word = lineSplit[0].lower() # pronunciation = lineSplit[1] # htkdic.append([word, pronunciation]) # return np.array(htkdic) #def doHVite(fileWav, label_file, fileHTKdic, fileFA, configHVite, filePhoneList, AcousticModel): # """ # Forced alignment using HVite of HTK. # :param path fileWav: path to the wav file in which the utterance was recorded. # :param path label_file: path to the text file in which the contents of file_txt is written as a word per line. # :param path fileHTKdic: path to the HTK dictionary file in which pronunciation variants are written. # :param path fileFA: path to the output file in which forced alignment (100ns unit) will be written. # :param path configHVite: path to the config file of HVite. # :param path filePhoneList: path to the list of phone used in the acoustic model and in the HTK dictionary file. # :param path AcousticModel: path to the acoustic model. # """ # with open(label_file, 'r') as f: # lines = f.read() # # Master Label File (= list of label files.) # fileMlf = tempfile.NamedTemporaryFile(mode='w', delete=False) # fileMlf.write("#!MLF!#\n") # fileMlf.write('"' + label_file + '"\n') # fileMlf.write(lines) # fileMlf.close() # # script # fileScp = tempfile.NamedTemporaryFile(mode='w', delete=False) # fileScp.write("%s" % (fileWav)) # fileScp.close() # # HVite # subprocessStr = 'HVite -T 1 -a -C ' + configHVite + ' -H ' + AcousticModel + ' -m -I ' + fileMlf.name + ' -i ' + fileFA + ' -S ' + fileScp.name + ' ' + fileHTKdic + ' ' + filePhoneList # subprocess.call(subprocessStr, shell=True) # # termination process # os.remove(fileMlf.name) # os.remove(fileScp.name) #def conv100ns2ms(fileFA_in_100ns, fileFA_in_ms): # """ # Convert the unit of forced alignment from 100ns to ms. 
# :param path fileFA_in_100ns: path to the forced alignment file written in the unit of 100ns. # :param path fileFA_in_ms: path to the output forced alignment file written in the unit of ms. # """ # with open(fileFA_in_100ns, 'r') as fin: # with open(fileFA_in_ms, 'w') as fout: # line = fin.readline() # fout.write(line) # line = fin.readline() # fout.write(line) # while line: # line = fin.readline() # dur = line.split() # # Each line of forced alignment file is: # # [durStart] [durEnd] [phoneme] [likelihood] [word] # if len(dur) == 4 or len(dur) == 5: # # convert 100ns -> ms # dur[0] = float(dur[0])/10000 # dur[1] = float(dur[1])/10000 # if len(dur) == 4: # fout.write('{0} {1} {2} {3}\n'.format(dur[0], dur[1], dur[2], dur[3])) # elif len(dur) == 5: # fout.write('{0} {1} {2} {3} {4}\n'.format(dur[0], dur[1], dur[2], dur[3], dur[4])) # else: # fout.write(line) #def ForcedAlignment(fileWav, file_txt, fileOut, configFile, saveIntermediateFiles): # """ # Forced Alignment # :param path fileWav: path to the wav file in which the utterance was recorded. # :param path file_txt: path to the text file in which the orthographycal transcription of the utterance is written in one line. # :param path fileOut: path to the output file in which forced alignment will be written in the unit of ms. # :param path configFile: path to the general config file (not the config file for HVite). # :param int saveIntermediateFile: if intermediate files (label file and dictionary file) should be saved. 0:no, 1:yes # """ # # label file: the list of words that appears in the wave file. # # should be in the same folder where wav file is stored. # label_file = fileWav.replace('.wav', '.lab') # # dic file: the pronunciation dictionary in which pronunciation(s) of each word in the label file are described. 
# fileHTKdic = fileWav.replace('.wav', '.dic') # # forced alignment output # fileFA = tempfile.NamedTemporaryFile(delete=False) # fileFA.close() # # load the config file # config = configparser.ConfigParser() # config.sections() # config.read(configFile) # dbLexicon = config['cLexicon']['dbLexicon'] # scriptBarbara = config['cLexicon']['scriptBarbara'] # exeG2P = config['cLexicon']['exeG2P'] # configHVite = config['pyHTK']['configHVite'] # filePhoneList = config['pyHTK']['filePhoneList'] # AcousticModel = config['pyHTK']['AcousticModel'] # # make database connection # param = r"Driver={Microsoft Access Driver (*.mdb, *.accdb)};dbq=" + dbLexicon + ";" # connect = pypyodbc.connect(param) # # instance of class lexicon # if sys.platform == 'win32': # scriptBarbara = scriptBarbara.replace('\\\\', '\\') # lex = cLexicon.lexicon(scriptBarbara, exeG2P) # # load the orthographical transcription # # and output that word by word (e.g. one word per line) in capital letters... # txt2label(file_txt, label_file) # # for each words in the label file pronunciation(s) are searched in lexicon database... # lab2HTKdic(label_file, fileHTKdic, lex, connect) # # forced alignment using HVite # doHVite(fileWav, label_file, fileHTKdic, fileFA.name, configHVite, filePhoneList, AcousticModel) # conv100ns2ms(fileFA.name, fileOut) # # termination process # if saveIntermediateFiles == 0: # os.remove(label_file) # os.remove(fileHTKdic) # os.remove(fileFA.name) # connect.close()
def _tie_sp_to_sil(self, macros, hmmdefs, output_dir):
    """Run HTK ``HHEd`` with the ``self.sil_hed`` edit script.

    Args:
        macros: macro file passed via the first ``-H``.
        hmmdefs: HMM definition file passed via the second ``-H``.
        output_dir: directory passed via ``-M``.

    Side effects:
        Stores the result of ``run_command`` in ``self.command``,
        ``self.output`` and ``self.error``.
    """
    hhed_cmd = [
        'HHEd',
        '-H', macros,
        '-H', hmmdefs,
        '-M', output_dir,
        self.sil_hed,
        self.phonelist_txt
    ]
    self.command, self.output, self.error = run_command(hhed_cmd)
def wav2mfc(self, hcopy_scp):
    """Run HTK ``HCopy`` using ``self.config_hcopy``.

    Args:
        hcopy_scp: script file passed via ``-S``.

    Side effects:
        Stores the result of ``run_command`` in ``self.command``,
        ``self.output`` and ``self.error``.
    """
    hcopy_cmd = ['HCopy', '-C', self.config_hcopy, '-S', hcopy_scp]
    self.command, self.output, self.error = run_command(hcopy_cmd)
def mlf_word2phone(self, mlf_phone, mlf_word):
    """Run HTK ``HLEd`` on a word-level MLF using ``self.mkphones_led``.

    Args:
        mlf_phone: file passed via ``-i`` (the MLF that HLEd writes).
        mlf_word: file passed as the last positional argument (the MLF
            that HLEd reads).

    Side effects:
        Stores the result of ``run_command`` in ``self.command``,
        ``self.output`` and ``self.error``.
    """
    hled_cmd = [
        'HLEd',
        '-l', '*',
        '-d', self.lexicon_file,
        '-i', mlf_phone,
        self.mkphones_led,
        mlf_word
    ]
    self.command, self.output, self.error = run_command(hled_cmd)