def arpa_to_binary(arpaFile, outFile):
    '''
    Transform an ARPA language model file to KenLM binary format.

    Args:
        <arpaFile>: ARPA file path.
        <outFile>: output binary file path.

    Return:
        the absolute path of the output file.
    '''
    assert isinstance(arpaFile, str), "<arpaFile> should be a string."
    if not os.path.isfile(arpaFile):
        raise WrongPath(f"No such file: {arpaFile}.")

    assert isinstance(outFile, str), "<outFile> should be a string."
    make_dependent_dirs(outFile)

    cmd = os.path.join(sys.prefix, "exkaldisrc", "tools", "build_binary")
    cmd += f" -s {arpaFile} {outFile}"
    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if (cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
        print(err.decode())
        raise KenlmProcessError("Failed to transform ARPA to binary format.")
    else:
        return os.path.abspath(outFile)
def arpa_to_binary(arpaFile, outFile):
    '''
    Transform an ARPA language model to KenLM binary format.

    Args:
        <arpaFile>: ARPA file path.
        <outFile>: output binary file path.

    Return:
        the output file name with the suffix ".binary".
    '''
    declare.is_file("arpaFile", arpaFile)
    declare.is_valid_string("outFile", outFile)

    outFile = outFile.strip()
    if not outFile.endswith(".binary"):
        outFile += ".binary"
    declare.is_valid_file_name("outFile", outFile)
    make_dependent_dirs(outFile)

    cmd = os.path.join(sys.prefix, "exkaldisrc", "tools", "build_binary")
    cmd += f" -s {arpaFile} {outFile}"
    out, err, cod = run_shell_command(cmd, stderr="PIPE")

    if (cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
        print(err.decode())
        if os.path.isfile(outFile):
            os.remove(outFile)
        raise KenlmProcessError("Failed to transform ARPA to binary format.")
    else:
        return outFile
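# A minimal usage sketch (not part of the library): "lm.arpa" is a
# hypothetical ARPA file trained beforehand. The binary format typically
# loads faster and uses less memory than the plain ARPA file.
def _demo_arpa_to_binary():
    binFile = arpa_to_binary("lm.arpa", "lm")  # ".binary" suffix is appended automatically
    return binFile  # -> "lm.binary"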
def save(self, fileName=None):
    '''
    Save arguments to a file in the specified format.

    Args:
        _fileName_: None, or a reasonable file name.

    Return:
        if <fileName> is None, return a string of all contents;
        else, the saved file name.
    '''
    if fileName is not None:
        declare.is_valid_file_name("fileName", fileName)
        make_dependent_dirs(fileName, True)

    contents = []
    contents.append(self.__discription)
    for name, info in self.__arguments.items():
        # option name
        m = "\n"
        m += f"name={name}\n"
        # option value
        if isinstance(info.value, (list, tuple)):
            value = "|".join(map(str, info.value))
        else:
            value = info.value
        m += f"value={value}\n"
        # abbreviation and dtype
        m += f"abbr={self.__name2Abb[name]}\n"
        m += f"dtype={info.dtype.__name__}\n"
        # default value
        if isinstance(info.default, (list, tuple)):
            default = "|".join(map(str, info.default))
        else:
            default = info.default
        m += f"default={default}\n"
        # choices
        if isinstance(info.choices, (list, tuple)):
            choices = "|".join(map(str, info.choices))
        else:
            choices = info.choices
        m += f"choices={choices}\n"
        # boundary and description
        m += f"minV={info.minV}\n"
        m += f"maxV={info.maxV}\n"
        m += f"discription={info.discription}"
        contents.append(m)

    contents = "\n".join(contents) + "\n"

    if fileName is not None:
        with open(fileName, "w", encoding="utf-8") as fw:
            fw.write(contents)
        return fileName
    else:
        return contents
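# For reference, each option is serialized as a newline-separated "key=value"
# block. A sketch of one saved entry (the option name and values below are
# hypothetical; the key "discription" keeps the spelling used in the code
# above, since a loader would have to parse that exact key back):
#
#     name=learnRate
#     value=0.01
#     abbr=lr
#     dtype=float
#     default=0.01
#     choices=None
#     minV=0.0
#     maxV=None
#     discription=the initial learning rate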
def save(self, fileName):
    '''
    Save the lattice to a .lat file.

    Args:
        <fileName>: file name.
    '''
    assert isinstance(fileName, str) and len(fileName) > 0, "file name is unavailable."
    if self.is_void:
        raise WrongOperation('No data to save.')

    if not fileName.rstrip().endswith(".lat"):
        fileName += ".lat"
    make_dependent_dirs(fileName)

    with open(fileName, "wb") as fw:
        fw.write(self.data)

    return os.path.abspath(fileName)
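# A minimal usage sketch (not part of the library): "lat" is assumed to be a
# decoded exkaldi lattice object obtained elsewhere.
def _demo_save_lattice(lat):
    path = lat.save("decode/test")  # ".lat" suffix is appended automatically
    return path  # absolute path of "decode/test.lat"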
def __init__(self, outDir='Result'):
    declare.is_valid_dir_name("outDir", outDir)
    make_dependent_dirs(outDir, pathIsFile=False)
    self.outDir = os.path.abspath(outDir)

    self.logFile = os.path.join(self.outDir, 'log')
    with open(self.logFile, 'w', encoding='utf-8'):
        pass

    self.currentField = {}
    self.currentFieldIsFloat = {}
    self.globalField = []

    self.lastSavedArch = {}
    self.savedArchs = []
    self.savingThreshold = None

    self._allKeys = []
    self._iterSymbol = -1
def __init__(self, outDir='Result'):
    assert isinstance(outDir, str), "<outDir> should be a name-like string."
    make_dependent_dirs(outDir, pathIsFile=False)
    self.outDir = os.path.abspath(outDir)

    self.logFile = os.path.join(self.outDir, 'log')
    with open(self.logFile, 'w', encoding='utf-8'):
        pass

    self.currentField = {}
    self.currentFieldIsFloat = {}
    self.globalField = []

    self.lastSavedArch = {}
    self.savedArchs = []
    self.savingThreshold = None

    self._allKeys = []
    self._iterSymbol = -1
def train_ngrams_srilm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-grams language model with the SRILM toolkit.
    If you don't specify the discounting method through the <config> option, "kndiscount" is used by default.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of the ARPA LM.
        <config>: extra configurations, a Python dict object.

    You can use the .check_config("train_ngrams_srilm") function to get a reference of extra configurations.
    Also, you can run the shell command "ngram-count" to look up their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    # verify the max order
    declare.less_equal("order", order, "max order", 9)
    # prepare the SRILM tool
    ExkaldiInfo.prepare_srilm()

    with FileHandleManager() as fhm:
        # Check whether this is a reasonable text corpus that is separated by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        unkSymbol = lexicons("oov")

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                        if key.endswith("discount"):
                            specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f'ngram-count -text {text} -order {order} -limit-vocab -vocab {wordlistTemp.name} -unk -map-unk "{unkSymbol}" '
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)
        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError('Failed to generate the N-grams language model.')

        return outFile
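# A minimal usage sketch (not part of the library): "lexicons" is assumed to
# be a LexiconBank built elsewhere, and "corpus.txt" is a hypothetical
# space-separated text corpus.
def _demo_train_srilm(lexicons):
    # Passing a "-wbdiscount" flag (a real ngram-count option) switches to
    # Witten-Bell discounting, so the default "-kndiscount" is skipped.
    arpaFile = train_ngrams_srilm(
        lexicons, order=3, text="corpus.txt", outFile="3grams.arpa",
        config={"-wbdiscount": True},
    )
    return arpaFile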
def train_ngrams_kenlm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-grams language model with the KenLM toolkit.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of the ARPA LM.
        <config>: extra configurations, a Python dict object.

    You can use the .check_config("train_ngrams_kenlm") function to get a reference of extra configurations.
    Also, you can run the shell command "lmplz" to look up their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)

    declare.less_equal("order", order, "max order", 9)

    with FileHandleManager() as fhm:
        # Check whether this is a reasonable text corpus that is separated by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        extraConfig = " "
        if config is not None:
            if check_config(name='train_ngrams_kenlm', config=config):
                if "--temp_prefix" in config.keys() and "-T" in config.keys():
                    raise WrongOperation('"--temp_prefix" and "-T" are the same configuration so only one of them is expected.')
                if "--memory" in config.keys() and "-S" in config.keys():
                    raise WrongOperation('"--memory" and "-S" are the same configuration so only one of them is expected.')
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f"{key} {value} "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words_count = math.ceil(len(words) / 10) * 10
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {text} --arpa {outFile} --limit_vocab_file {wordlistTemp.name}"
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate arpa file.")

        return outFile
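# A minimal usage sketch (not part of the library): chains KenLM training with
# the binary conversion defined earlier. "lexicons" is assumed to be a
# LexiconBank; the corpus and output file names are hypothetical.
def _demo_train_kenlm(lexicons):
    arpaFile = train_ngrams_kenlm(
        lexicons, order=4, text="corpus.txt", outFile="4grams.arpa",
        config={"--memory": "20%"},  # lmplz option: cap the sorting memory
    )
    return arpa_to_binary(arpaFile, "4grams.binary")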
def run_kaldi_commands_parallel(resources, cmdPattern, analyzeResult=True, timeout=ExKaldiInfo.timeout, generateArchive=None, archiveNames=None):
    '''
    Map resources to the command pattern and run the command in parallel.

    Args:
        <resources>: a dict whose keys are resource names and values are lists of resource objects.
                    For example: {"feat": [BytesFeat01, BytesFeat02, ...], "outFile": ["newFeat01.ark", "newFeat02.ark", ...]}.
                    The "outFile" resource is necessary.
                    When there is only one process to run, "outFile" can be "-", which means the standard output stream.
        <cmdPattern>: a string used to map the resources.
                    For example: "copy-feats {feat} ark:{outFile}".

    Return:
        a list of triples: (return code, error info, output file or buffer).
    '''
    declare.kaldi_existed()
    declare.is_classes("resources", resources, dict)
    declare.is_classes("cmdPattern", cmdPattern, str)
    assert "outFile" in resources.keys(), "<outFile> key and value is necessary in resources."

    declare.members_are_classes("the values of resources", resources.values(), [list, tuple])
    if generateArchive is not None:
        analyzeResult = True  # forcibly analyze the result

    # check the format of the command pattern
    nameIndexs = [i for i, c in enumerate(cmdPattern) if c == "{" or c == "}"]
    assert len(nameIndexs) % 2 == 0, f"The numbers of braces do not match in command pattern: '{cmdPattern}'."
    auxiliaryInfo = {}
    for i in range(0, len(nameIndexs), 2):
        name = cmdPattern[nameIndexs[i] + 1: nameIndexs[i + 1]]
        if name not in resources:
            raise WrongDataFormat(f"Resource is necessary but has not been provided: {name}.")
        prefix = "" if nameIndexs[i] == 0 else cmdPattern[nameIndexs[i] - 1]
        if name in auxiliaryInfo.keys():
            auxiliaryInfo[name][0] += 1
            if not prefix in auxiliaryInfo[name][1]:
                auxiliaryInfo[name][1] += prefix
        else:
            auxiliaryInfo[name] = [1, prefix]

    assert "outFile" in auxiliaryInfo.keys(), "Key: <outFile> is necessary in command pattern."
    _outFileCountInfo = auxiliaryInfo.pop("outFile")
    assert _outFileCountInfo[0] == 1, f"Only allow <outFile> to appear one time in command pattern but: {_outFileCountInfo[0]}."
    outFiles = resources.pop("outFile")

    for outFile in outFiles:
        if outFile != "-":
            make_dependent_dirs(outFile, pathIsFile=True)
    parallel = len(outFiles)

    if generateArchive is not None:
        declare.is_instances("generateArchive", generateArchive, ["feat", "cmvn", "ali", "fmllr"])
        if archiveNames is None:
            archiveNames = [generateArchive for i in range(parallel)]
        elif isinstance(archiveNames, str):
            archiveNames = [archiveNames for i in range(parallel)]
        elif isinstance(archiveNames, (list, tuple)):
            declare.equal("the number of archive names", len(archiveNames), "parallel", parallel)
        else:
            raise UnsupportedType(f"<archiveNames> should be string or list or tuple but got: {type_name(archiveNames)}.")

    # regulate resources and run
    with FileHandleManager() as fhm:

        newResources = {}
        if parallel == 1:
            # Detect whether there is a PIPE in the command pattern.
            testPlaceholder = dict((key, value[0]) if isinstance(value[0], str) else (key, "placeholder") for key, value in resources.items())
            testPlaceholder["outFile"] = "placeholder"
            testCmd = cmdPattern.format(**testPlaceholder)
            if "|" in testCmd:
                inputsBuffer = False
            else:
                inputsBuffer = True
            del testPlaceholder

            # regulate resources
            for key, countPrefix in auxiliaryInfo.items():
                count, prefix = countPrefix
                target = resources[key][0]

                # If the target is a list table, we cannot automatically decide whether it is scp-format or ark-format,
                # so you should specify that in the command pattern.
                if type_name(target) in ["ListTable", "Transcription"]:
                    if prefix not in [":", "="]:
                        errMes = f"There might be a missing prefix such as 'ark:', 'scp:' or '--option=' in command pattern before resource: {key}. "
                        errMes += "Please check the command line. If you still think the prefix is unnecessary, "
                        errMes += "save this ListTable or Transcription to a file and use that file name instead. "
                        errMes += "In that case, we will skip checking the prefix."
                        raise WrongOperation(errMes)

                    target = target.sort()
                    if (inputsBuffer is True) and count == 1:
                        inputsBuffer = target.save()
                        newResources[key] = "-"
                    else:
                        targetTemp = fhm.create("w+", encoding="utf-8")
                        target.save(targetTemp)
                        newResources[key] = f"{targetTemp.name}"

                # If the target is an index table, we automatically recognize it as an scp file, so you do not need to specify a prefix.
                elif type_name(target) == "IndexTable":
                    if prefix != " ":
                        errMes = f"Do not use a prefix such as 'ark:' or 'scp:' in command pattern before: {key}, "
                        errMes += "because we will decide the prefix depending on its data type."
                        raise WrongOperation(errMes)

                    target = target.sort()
                    if (inputsBuffer is True) and count == 1:
                        inputsBuffer = target.save()
                        newResources[key] = "scp:-"
                    else:
                        targetTemp = fhm.create("w+", suffix=".scp", encoding="utf-8")
                        target.save(targetTemp)
                        newResources[key] = f"scp:{targetTemp.name}"

                elif isinstance(target, (str, int, float)):
                    # a file name or other value parameter
                    newResources[key] = f"{target}"

                elif isinstance(target, (BytesMatrix, BytesVector)):
                    if prefix != " ":
                        errMes = f"Do not use a prefix such as 'ark:' or 'scp:' in command pattern before: {key}, "
                        errMes += "because we will decide the prefix depending on its data type."
                        raise WrongOperation(errMes)

                    target = target.sort()
                    if (inputsBuffer is True) and count == 1:
                        inputsBuffer = target.data
                        newResources[key] = "ark:-"
                    else:
                        targetTemp = fhm.create("wb+", suffix=".ark")
                        target.save(targetTemp)
                        newResources[key] = f"ark:{targetTemp.name}"

                elif isinstance(target, (NumpyMatrix, NumpyVector)):
                    if prefix != " ":
                        errMes = f"Do not use a prefix such as 'ark:' or 'scp:' in command pattern before: {key}, "
                        errMes += "because we will decide the prefix depending on its data type."
                        raise WrongOperation(errMes)

                    target = target.sort()
                    if (inputsBuffer is True) and count == 1:
                        inputsBuffer = target.to_bytes().data
                        newResources[key] = "ark:-"
                    else:
                        target = target.to_bytes()
                        targetTemp = fhm.create("wb+", suffix=".ark")
                        target.save(targetTemp)
                        newResources[key] = f"ark:{targetTemp.name}"

                elif isinstance(target, BytesArchive):
                    if (inputsBuffer is True) and count == 1:
                        inputsBuffer = target.data
                        newResources[key] = "-"
                    else:
                        targetTemp = fhm.create("wb+")
                        target.save(targetTemp)
                        newResources[key] = f"{targetTemp.name}"

                else:
                    raise UnsupportedType(f"<target> should be IndexTable, ListTable, a file name, an int or float value, or an exkaldi archive object but got: {type_name(target)}.")

            # Then, process the output stream
            outFile = outFiles[0]
            newResources["outFile"] = outFile

            inputsBuffer = None if isinstance(inputsBuffer, bool) else inputsBuffer
            # Then run the command
            finalCmd = cmdPattern.format(**newResources)
            out, err, cod = run_shell_command(finalCmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=inputsBuffer)

            if analyzeResult:
                if cod != 0:
                    finalCmd = ",".join([cmd.strip().split(maxsplit=1)[0] for cmd in finalCmd.split("|")])
                    raise KaldiProcessError(f"Failed to run Kaldi command: {finalCmd}.", err.decode())

            if outFile == "-":
                if generateArchive is not None:
                    if generateArchive == "feat":
                        out = BytesFeat(data=out, name=archiveNames[0])
                    elif generateArchive == "ali":
                        out = BytesAliTrans(data=out, name=archiveNames[0])
                    elif generateArchive == "cmvn":
                        out = BytesCMVN(data=out, name=archiveNames[0])
                    else:
                        out = BytesFmllr(data=out, name=archiveNames[0])
                    return out
                else:
                    return (cod, err, out)
            else:
                if generateArchive is not None:
                    return load_index_table(outFile, name=archiveNames[0], useSuffix="ark")
                else:
                    return (cod, err, outFile)

        else:
            # In this case, all input IO streams must be files.
            for key, countPrefix in auxiliaryInfo.items():
                count, prefix = countPrefix
                values = resources[key]
                newValues = []
                for target in values:

                    # If the target is an scp resource
                    if type_name(target) in ["ListTable", "Transcription"]:
                        if prefix not in [":", "="]:
                            errMes = f"There might be a missing prefix such as 'ark:', 'scp:' or '--option=' in command pattern before resource: {key}. "
                            errMes += "Please check the command line. If you still think the prefix is unnecessary, "
                            errMes += "save this ListTable or Transcription to a file and use that file name instead. "
                            errMes += "In that case, we will skip checking the prefix."
                            raise WrongOperation(errMes)

                        target = target.sort()
                        targetTemp = fhm.create("w+", encoding="utf-8")
                        target.save(targetTemp)
                        newValues.append(f"{targetTemp.name}")

                    elif type_name(target) == "IndexTable":
                        if prefix != " ":
                            errMes = f"Do not use a prefix such as 'ark:' or 'scp:' in command pattern before: {key}, "
                            errMes += "because we will decide the prefix depending on its data type."
                            raise WrongOperation(errMes)

                        target = target.sort()
                        targetTemp = fhm.create("w+", suffix=".scp", encoding="utf-8")
                        target.save(targetTemp)
                        newValues.append(f"scp:{targetTemp.name}")

                    elif isinstance(target, (str, float, int)):
                        # a file name or other value parameter
                        newValues.append(f"{target}")

                    elif isinstance(target, (BytesMatrix, BytesVector)):
                        if prefix != " ":
                            errMes = f"Do not use a prefix such as 'ark:' or 'scp:' in command pattern before: {key}, "
                            errMes += "because we will decide the prefix depending on its data type."
                            raise WrongOperation(errMes)

                        target = target.sort()
                        targetTemp = fhm.create("wb+", suffix=".ark")
                        target.save(targetTemp)
                        newValues.append(f"ark:{targetTemp.name}")

                    elif isinstance(target, (NumpyMatrix, NumpyVector)):
                        if prefix != " ":
                            errMes = f"Do not use a prefix such as 'ark:' or 'scp:' in command pattern before: {key}, "
                            errMes += "because we will decide the prefix depending on its data type."
                            raise WrongOperation(errMes)

                        target = target.sort().to_bytes()
                        targetTemp = fhm.create("wb+", suffix=".ark")
                        target.save(targetTemp)
                        newValues.append(f"ark:{targetTemp.name}")

                    elif isinstance(target, BytesArchive):
                        targetTemp = fhm.create("wb+")
                        target.save(targetTemp)
                        newValues.append(f"{targetTemp.name}")

                    else:
                        raise UnsupportedType(f"<target> should be IndexTable, ListTable, Transcription, a file name, an int or float value, or an exkaldi archive object but got: {type_name(target)}.")

                newResources[key] = newValues

            newResources["outFile"] = outFiles

            # assign these resources to each process and generate multiple commands
            parallelResources = []
            for i in range(parallel):
                parallelResources.append({})
                for key, items in newResources.items():
                    parallelResources[-1][key] = items[i]
            cmds = [cmdPattern.format(**re) for re in parallelResources]

            # run in parallel
            flags = run_shell_command_parallel(cmds, timeout=timeout)

            finalResult = []
            done = True
            for index, info in enumerate(flags):
                cod, err = info
                if analyzeResult and cod != 0:
                    print(f"{index}/{len(flags)} error tracking")
                    print(err.decode())
                    done = False
                finalResult.append((cod, err, outFiles[index]))

            if analyzeResult and (not done):
                finalCmd = ",".join([cmd.strip().split(maxsplit=1)[0] for cmd in cmds[0].split("|")])
                raise KaldiProcessError(f"Failed to run Kaldi command: {finalCmd}. Look at the error messages above.")
            else:
                if generateArchive is not None:
                    for i, fileName in enumerate(outFiles):
                        finalResult[i] = load_index_table(fileName, name=archiveNames[i], useSuffix="ark")

            return finalResult
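# A minimal usage sketch (not part of the library), following the docstring's
# own "copy-feats" pattern: feat01 and feat02 are assumed to be exkaldi
# BytesFeat objects prepared elsewhere; the output file names are hypothetical.
def _demo_parallel_copy(feat01, feat02):
    results = run_kaldi_commands_parallel(
        resources={"feat": [feat01, feat02], "outFile": ["new01.ark", "new02.ark"]},
        cmdPattern="copy-feats {feat} ark:{outFile}",
    )
    # With two output files, two processes run in parallel and "results"
    # is a list of (return code, error info, output file) triples.
    return results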
def train_ngrams_srilm(lexicons, order, textFile, outFile, config=None):
    '''
    Train an n-grams language model with the SRILM toolkit.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of n-grams.
        <textFile>: text corpus file.
        <outFile>: ARPA output file name.
        <config>: configurations, a Python dict object.

    You can use the .check_config("train_ngrams_srilm") function to get the configurations that you can set.
    Also, you can run the shell command "ngram-count" to look up their meaning.
    '''
    assert isinstance(order, int) and order > 0 and order < 10, "Expected <order> is a positive int value and it must be smaller than 10."
    assert isinstance(textFile, str), "Expected <textFile> is a name-like string."
    assert isinstance(outFile, str), "Expected <outFile> is a name-like string."
    assert type_name(lexicons) == "LexiconBank", f"Expected <lexicons> is an exkaldi LexiconBank object but got {type_name(lexicons)}."

    ExkaldiInfo.prepare_srilm()

    if not os.path.isfile(textFile):
        raise WrongPath(f"No such file: {textFile}")
    else:
        ## Check the number of lines
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if (isinstance(cod, int) and cod != 0):
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")

    wordlist = tempfile.NamedTemporaryFile("w+", encoding='utf-8', suffix=".txt")
    unkSymbol = lexicons("oov")
    try:
        lexiconp = lexicons("lexiconp")
        words = [x[0] for x in lexiconp.keys()]
        wordlist.write("\n".join(words))
        wordlist.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                        if key.endswith("discount"):
                            specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f"ngram-count -text {textFile} -order {order} -limit-vocab -vocab {wordlist.name} -unk -map-unk {unkSymbol} "
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        cmd += f" -lm {outFile}"
        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError('Failed to generate the n-grams language model.')
        else:
            return os.path.abspath(outFile)

    finally:
        wordlist.close()
def train_ngrams_kenlm(lexicons, order, textFile, outFile, config=None):
    '''
    Train an n-grams language model with the KenLM toolkit.

    Args:
        <lexicons>: words.txt file path or an exkaldi LexiconBank object.
        <order>: the maximum order of n-grams.
        <textFile>: text corpus file.
        <outFile>: ARPA output file name.
        <config>: configurations, a Python dict object.

    You can use the .check_config("train_ngrams_kenlm") function to get the configurations that you can set.
    Also, you can run the shell command "lmplz" to look up their meaning.
    '''
    assert isinstance(order, int) and 0 < order <= 6, "We support a maximum of 6-grams LM in the current version."

    if not os.path.isfile(textFile):
        raise WrongPath("No such file: {}".format(textFile))
    else:
        ## Check the number of lines
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if (isinstance(cod, int) and cod != 0):
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")

    extraConfig = " "
    if config is not None:
        assert isinstance(config, dict), f"<config> should be a dict object but got: {type_name(config)}."
        if check_config(name='train_ngrams_kenlm', config=config):
            if "--temp_prefix" in config.keys() and "-T" in config.keys():
                raise WrongOperation('"--temp_prefix" and "-T" are the same configuration so only one of them is expected.')
            if "--memory" in config.keys() and "-S" in config.keys():
                raise WrongOperation('"--memory" and "-S" are the same configuration so only one of them is expected.')
            for key, value in config.items():
                if isinstance(value, bool):
                    if value is True:
                        extraConfig += f"{key} "
                else:
                    extraConfig += f"{key} {value} "

    assert isinstance(outFile, str), "<outFile> should be a string."
    if not outFile.rstrip().endswith(".arpa"):
        outFile += ".arpa"
    make_dependent_dirs(outFile, pathIsFile=True)

    words = tempfile.NamedTemporaryFile("w+", suffix=".txt", encoding="utf-8")
    try:
        if type_name(lexicons) == "LexiconBank":
            ws = lexicons("words")
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws.keys())
        elif isinstance(lexicons, str):
            if not os.path.isfile(lexicons):
                raise WrongPath(f"No such file: {lexicons}.")
            with open(lexicons, "r", encoding="utf-8") as fr:
                lines = fr.readlines()
            ws = []
            for line in lines:
                line = line.strip().split(maxsplit=1)
                if len(line) < 1:
                    continue
                else:
                    ws.append(line[0])
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws)
        else:
            raise UnsupportedType("<lexicons> should be a LexiconBank object or a file path.")

        words.write(ws)
        words.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")
        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {textFile} --arpa {outFile} --limit_vocab_file {words.name}"
        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            raise KenlmProcessError("Failed to generate arpa file.")
        else:
            return os.path.abspath(outFile)

    finally:
        words.close()