Ejemplo n.º 1
0
def arpa_to_binary(arpaFile, outFile):
    '''
    Transform an ARPA language model file into a KenLM binary format file.

    Args:
        <arpaFile>: ARPA file path.
        <outFile>: output binary file path.

    Return:
        The absolute path of the output file.

    Raises:
        AssertionError: if <arpaFile> or <outFile> is not a string.
        WrongPath: if <arpaFile> is not an existing file.
        KenlmProcessError: if the build_binary tool returns a non-zero code,
            or the output file is missing or empty afterwards.
    '''
    import shlex  # local import: only needed to quote paths for the shell

    assert isinstance(arpaFile, str), "<arpaFile> should be a string."
    if not os.path.isfile(arpaFile):
        raise WrongPath(f"No such file:{arpaFile}.")

    assert isinstance(outFile, str), "<outFile> should be a string."
    make_dependent_dirs(outFile)

    # Quote every path so that spaces or shell metacharacters in a file name
    # cannot split or alter the command line.
    tool = os.path.join(sys.prefix, "exkaldisrc", "tools", "build_binary")
    cmd = f"{shlex.quote(tool)} -s {shlex.quote(arpaFile)} {shlex.quote(outFile)}"
    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0 or not os.path.isfile(outFile) or os.path.getsize(outFile) == 0:
        print(err.decode())
        # Do not leave a truncated or empty model file behind.
        if os.path.isfile(outFile):
            os.remove(outFile)
        raise KenlmProcessError("Failed to transform ARPA to binary format.")

    return os.path.abspath(outFile)
Ejemplo n.º 2
0
def arpa_to_binary(arpaFile, outFile):
    '''
    Transform an ARPA language model into KenLM binary format.

    Args:
        <arpaFile>: ARPA file path.
        <outFile>: output binary file path. Surrounding whitespace is stripped
            and the ".binary" suffix is appended when it is missing.

    Return:
        The output file name with suffix ".binary".

    Raises:
        KenlmProcessError: if the build_binary tool returns a non-zero code,
            or the output file is missing or empty afterwards.
    '''
    import shlex  # local import: only needed to quote paths for the shell

    declare.is_file("arpaFile", arpaFile)
    declare.is_valid_string("outFile", outFile)
    outFile = outFile.strip()
    if not outFile.endswith(".binary"):
        outFile += ".binary"

    declare.is_valid_file_name("outFile", outFile)
    make_dependent_dirs(outFile)

    # Quote every path so that spaces or shell metacharacters in a file name
    # cannot split or alter the command line.
    tool = os.path.join(sys.prefix, "exkaldisrc", "tools", "build_binary")
    cmd = f"{shlex.quote(tool)} -s {shlex.quote(arpaFile)} {shlex.quote(outFile)}"
    out, err, cod = run_shell_command(cmd, stderr="PIPE")

    if cod != 0 or not os.path.isfile(outFile) or os.path.getsize(outFile) == 0:
        print(err.decode())
        # Remove a truncated or empty result so that it cannot be mistaken for a model.
        if os.path.isfile(outFile):
            os.remove(outFile)
        raise KenlmProcessError("Failed to transform ARPA to binary format.")

    return outFile
Ejemplo n.º 3
0
	def save(self, fileName=None):
		'''
		Serialize the description and every argument into a text block.

		Args:
			_fileName_: None, or a reasonable file name.

		Return:
			if fileName is None:
				a string holding all contents
			else:
				the name of the file the contents were written to
		'''
		toFile = fileName is not None
		if toFile:
			declare.is_valid_file_name("fileName", fileName)
			make_dependent_dirs(fileName, True)

		def _flat(item):
			# lists and tuples are written as "a|b|c"; other values are formatted as they are
			if isinstance(item, (list, tuple)):
				return "|".join(map(str, item))
			return item

		sections = [self.__discription]
		for name, info in self.__arguments.items():
			# the leading empty entry yields the blank line that separates two sections
			fields = [
				"",
				f"name={name}",
				f"value={_flat(info.value)}",
				f"abbr={self.__name2Abb[name]}",
				f"dtype={info.dtype.__name__}",
				f"default={_flat(info.default)}",
				f"choices={_flat(info.choices)}",
				f"minV={info.minV}",
				f"maxV={info.maxV}",
				f"discription={info.discription}",
			]
			sections.append("\n".join(fields))

		text = "\n".join(sections) + "\n"

		if not toFile:
			return text

		with open(fileName, "w", encoding="utf-8") as fw:
			fw.write(text)
		return fileName
Ejemplo n.º 4
0
	def save(self, fileName):
		'''
		Write the lattice data to a ".lat" file.

		Args:
			<fileName>: file name. The ".lat" suffix is appended when the name
				(ignoring trailing whitespace) does not end with it.

		Return:
			The absolute path of the saved file.
		'''
		assert isinstance(fileName, str) and len(fileName) > 0, "file name is unavaliable."

		if self.is_void:
			raise WrongOperation('No any data to save.')

		hasSuffix = fileName.rstrip().endswith(".lat")
		target = fileName if hasSuffix else fileName + ".lat"

		make_dependent_dirs(target)

		# the data is written as raw bytes
		with open(target, "wb") as fw:
			fw.write(self.data)

		return os.path.abspath(target)
Ejemplo n.º 5
0
    def __init__(self, outDir='Result'):
        '''
        Prepare the output directory, create an empty log file in it and
        reset all record containers.

        Args:
            <outDir>: name of the output directory.
        '''
        declare.is_valid_dir_name("outDir", outDir)
        make_dependent_dirs(outDir, pathIsFile=False)

        absDir = os.path.abspath(outDir)
        self.outDir = absDir
        self.logFile = os.path.join(absDir, 'log')
        # Opening with "w" creates the log file, or empties an existing one.
        with open(self.logFile, 'w', encoding='utf-8'):
            pass

        # containers of recorded fields
        self.currentField, self.currentFieldIsFloat = {}, {}
        self.globalField = []

        # bookkeeping of saved archives
        self.lastSavedArch = {}
        self.savedArchs = []
        self.savingThreshold = None

        self._allKeys = []
        self._iterSymbol = -1
Ejemplo n.º 6
0
    def __init__(self, outDir='Result'):
        '''
        Prepare the output directory, create an empty log file in it and
        reset all record containers.

        Args:
            <outDir>: name of the output directory, a string.
        '''
        assert isinstance(outDir, str), "<outDir> should be a name-like string."
        make_dependent_dirs(outDir, pathIsFile=False)

        absDir = os.path.abspath(outDir)
        self.outDir = absDir
        self.logFile = os.path.join(absDir, 'log')
        # Opening with "w" creates the log file, or empties an existing one.
        with open(self.logFile, 'w', encoding='utf-8'):
            pass

        # containers of recorded fields
        self.currentField, self.currentFieldIsFloat = {}, {}
        self.globalField = []

        # bookkeeping of saved archives
        self.lastSavedArch = {}
        self.savedArchs = []
        self.savingThreshold = None

        self._allKeys = []
        self._iterSymbol = -1
Ejemplo n.º 7
0
def train_ngrams_srilm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the SRILM toolkit.
    If no discount option is given through <config>, "-kndiscount" is used by default.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-Grams (not larger than 9).
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of the ARPA LM. The ".arpa" suffix is appended when missing.
        <config>: extra configurations, a Python dict object. Boolean values are passed
            as flags (only when True), other values as "key value" pairs.

    You can use .check_config("train_ngrams_srilm") function to get a reference of extra configurations.
    Also you can run shell command "ngram-count" to look their usage.

    Return:
        The name of the generated ARPA file.

    Raises:
        ShellProcessError: if sampling lines from the text file fails.
        WrongDataFormat: if the text is empty or does not look space-separated.
        KaldiProcessError: if ngram-count fails or produces a missing/empty file.
    '''
    import shlex  # local import: only needed to quote paths for the shell

    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    # verify the max order
    declare.less_equal("order", order, "max order", 9)
    # prepare srilm tool
    ExkaldiInfo.prepare_srilm()

    with FileHandleManager() as fhm:
        # Check whether this is a reasonable text corpus that should be split by spaces:
        # sample up to 100 sentences and count the spaces in them.
        if isinstance(text, str):
            cmd = f"shuf {shlex.quote(text)} -n 100"
            out, err, cod = run_shell_command(cmd,
                                              stdout="PIPE",
                                              stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(
                    f"Failed to sample from text file:{text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file:{text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = sum(line.count(" ") for line in out)
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat(
                        "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                    )

        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for _, value in sampleText.items():
                assert isinstance(
                    value, str
                ), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                )
            # ngram-count reads a file, so dump the transcription (without utterance IDs) to a temp file.
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        unkSymbol = lexicons("oov")

        # The vocabulary is restricted to the words of the lexicon.
        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                        # Any boolean "*discount" key (even when False) suppresses
                        # the default "-kndiscount" below.
                        if key.endswith("discount"):
                            specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = (
            f'ngram-count -text {shlex.quote(text)} -order {order} -limit-vocab '
            f'-vocab {shlex.quote(wordlistTemp.name)} -unk -map-unk "{unkSymbol}" '
        )
        # Hand the validated extra options over to ngram-count. They used to be
        # assembled but never added to the command, so <config> had no effect.
        cmd += extraConfig + " "
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)
        cmd += f" -lm {shlex.quote(outFile)}"

        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            # Remove a truncated or empty result so that it cannot be mistaken for a model.
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError(
                'Failed to generate N-Grams language model.')

        return outFile
Ejemplo n.º 8
0
def train_ngrams_kenlm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the KenLM toolkit.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-Grams (not larger than 9).
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of the ARPA LM. The ".arpa" suffix is appended when missing.
        <config>: extra configurations, a Python dict object. Boolean values are passed
            as flags (only when True), other values as "key value" pairs.

    You can use .check_config("train_ngrams_kenlm") function to get a reference of extra configurations.
    Also you can run shell command "lmplz" to look their usage.

    Return:
        The name of the generated ARPA file.

    Raises:
        ShellProcessError: if sampling lines from the text file fails.
        WrongDataFormat: if the text is empty or does not look space-separated.
        WrongOperation: if both spellings of the same option are given in <config>.
        KenlmProcessError: if lmplz fails or produces a missing/empty file.
    '''
    import shlex  # local import: only needed to quote paths for the shell

    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)

    declare.less_equal("order", order, "max order", 9)

    with FileHandleManager() as fhm:
        # Check whether this is a reasonable text corpus that should be split by spaces:
        # sample up to 100 sentences and count the spaces in them.
        if isinstance(text, str):
            cmd = f"shuf {shlex.quote(text)} -n 100"
            out, err, cod = run_shell_command(cmd,
                                              stdout="PIPE",
                                              stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(
                    f"Failed to sample from text file:{text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file:{text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = sum(line.count(" ") for line in out)
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat(
                        "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                    )

        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for _, value in sampleText.items():
                assert isinstance(
                    value, str
                ), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces or sentences are extremely short."
                )
            # lmplz reads a file, so dump the transcription (without utterance IDs) to a temp file.
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        extraConfig = " "
        if config is not None:
            if check_config(name='train_ngrams_kenlm', config=config):
                # The long and the short spelling of one option must not be mixed.
                if "--temp_prefix" in config.keys() and "-T" in config.keys():
                    raise WrongOperation(
                        '"--temp_prefix" and "-T" is the same configuration so only one of them is expected.'
                    )
                if "--memory" in config.keys() and "-S" in config.keys():
                    raise WrongOperation(
                        '"--memory" and "-S" is the same configuration so only one of them is expected.'
                    )
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f"{key} {value} "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        # The vocabulary is restricted to the words of the lexicon.
        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        # the vocabulary size estimate is rounded up to a multiple of 10
        words_count = math.ceil(len(words) / 10) * 10
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        # Quote every path so that spaces or shell metacharacters in a file name
        # cannot split or alter the command line.
        cmd = (
            f"{shlex.quote(KenLMTool)}{extraConfig}-o {order} --vocab_estimate {words_count} "
            f"--text {shlex.quote(text)} --arpa {shlex.quote(outFile)} "
            f"--limit_vocab_file {shlex.quote(wordlistTemp.name)}"
        )
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            # Remove a truncated or empty result so that it cannot be mistaken for a model.
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate arpa file.")

        return outFile
Ejemplo n.º 9
0
def run_kaldi_commands_parallel(resources,cmdPattern,analyzeResult=True,timeout=ExKaldiInfo.timeout,generateArchive=None,archiveNames=None):
	'''
	Map resources to command pattern and run this command parallelly.

	Args:
		<resources>: a dict whose keys are the names of resources and values are lists (or tuples) of resource objects.
					For example: {"feat": [BytesFeat01,BytesFeat02,... ],"outFile": ["newFeat01.ark","newFeat02.ark",...] }.
					The "outFile" resource is necessary and its length decides the number of processes.
					When there is only one process to run,"outFile" can be "-" which means the standard output stream.
					The dict given by the caller is not modified.

		<cmdPattern>: a string needed to map the resources.
					For example: "copy-feat {feat} ark:{outFile}".
					"{outFile}" must appear exactly one time.

		<analyzeResult>: if True,raise KaldiProcessError when a command returns a non-zero code.
					It is forced to be True when <generateArchive> is not None.

		<timeout>: passed to run_shell_command_parallel(). It is only used when there is more than one process.

		<generateArchive>: None,or one of "feat","cmvn","ali","fmllr".
					If it is not None,the results are returned as archive objects (single process writing to "-")
					or loaded from the output files with load_index_table().

		<archiveNames>: None,a string,or a list/tuple with one name for each process.
					It is only used when <generateArchive> is not None. If None,<generateArchive> is used as the name.

	Return:
		If there is only one process: a single result,which is an archive or index table object when
		<generateArchive> is not None,otherwise a triple (return code,error info,output file name or output buffer).
		If there are several processes: a list with one result for each process,
		each being a triple (return code,error info,output file name),or an index table when <generateArchive> is not None.
	'''
	declare.kaldi_existed()
	declare.is_classes("resources",resources,dict)
	declare.is_classes("cmdPattern",cmdPattern,str)
	assert "outFile" in resources.keys(),"<outFile> key and value is necessary in recources."

	declare.members_are_classes("the values of resources",resources.values(),[list,tuple])
	if generateArchive is not None:
		analyzeResult = True #forcely analyze the result

	# check the format of command pattern
	# Every "{name}" placeholder is recorded with how often it appears and which character precedes it.
	nameIndexs = [ i for i,c in enumerate(cmdPattern) if c == "{" or c == "}" ]
	assert len(nameIndexs)%2 == 0,f"The numbers of braces do not match in command pattern: '{cmdPattern}'. "
	auxiliaryInfo = {}
	for i in range(0,len(nameIndexs),2):
		name = cmdPattern[nameIndexs[i]+1:nameIndexs[i+1]]
		if name not in resources:
			raise WrongDataFormat(f"Resource is necessary but has not been provided: {name}.")
		prefix = "" if nameIndexs[i] == 0 else cmdPattern[nameIndexs[i]-1]
		if name in auxiliaryInfo.keys():
			auxiliaryInfo[name][0] += 1
			if not prefix in auxiliaryInfo[name][1]:
				auxiliaryInfo[name][1] += prefix
		else:
			auxiliaryInfo[name] = [1,prefix]

	assert "outFile" in auxiliaryInfo.keys(),"Key: <outFile> is necessary in command pattern."
	_outFileCountInfo = auxiliaryInfo.pop("outFile")
	assert _outFileCountInfo[0] == 1,f"Only allow <outFile> appear one time in command pattern but: {_outFileCountInfo[0]}."
	# Work on a shallow copy so that popping "outFile" does not alter the caller's dict
	# (otherwise the same dict could not be used for a second call).
	resources = dict(resources)
	outFiles = resources.pop("outFile")

	for outFile in outFiles:
		if outFile != "-":
			make_dependent_dirs(outFile,pathIsFile=True)
	parallel = len(outFiles)

	if generateArchive is not None:
		declare.is_instances("generateArchive",generateArchive,["feat","cmvn","ali","fmllr"])
		if archiveNames is None:
			archiveNames = [ generateArchive for i in range(parallel)]
		elif isinstance(archiveNames,str):
			archiveNames = [ archiveNames for i in range(parallel)]
		elif isinstance(archiveNames,(list,tuple)):
			declare.equal("the number of achieve names",len(archiveNames),"parallel",parallel)
		else:
			raise UnsupportedType(f"<archiveNames> should be string or list or tuple but got: {type_name(archiveNames)}.")

	# regulate resources and run
	with FileHandleManager() as fhm:

		newResources = {}
		if parallel == 1:
			# Detect whether there is PIPE in command pattern.
			# If there is one,no resource is sent through the standard input stream.
			testPlaceholder = dict( (key,value[0]) if isinstance(value[0],str) else (key,"placeholder") for key,value in resources.items() )
			testPlaceholder["outFile"] = "placeholder"
			testCmd = cmdPattern.format(**testPlaceholder)
			if "|" in testCmd:
				inputsBuffer = False
			else:
				inputsBuffer = True
			del testPlaceholder
			# regulate resources
			# <inputsBuffer> stays True until one resource takes the standard input stream,
			# then it holds the data of that resource. All other resources are saved to temporary files.
			for key,countPrefix in auxiliaryInfo.items():
				count,prefix = countPrefix
				target = resources[key][0]

				# If target is a list-table,we can not automatically decide whether it is scp-format or ark-format.
				# So you should appoint it in the command pattern.
				if type_name(target) in ["ListTable","Transcription"]:
					if prefix not in [":","="]:
						errMes = f"There might miss prefix such as 'ark:' or 'scp:' or '--option=' in command pattern before resource: {key}."
						errMes += "Check the command line please. If you still think there dose not need the prefix,"
						errMes += "save this ListTable or Transcription into file and instead it will this file name."
						errMes += "In that case,we will skip checking the prefix."
						raise WrongOperation(errMes)

					target = target.sort()
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.save()
						newResources[key] = "-"
					else:
						targetTemp = fhm.create("w+",encoding="utf-8")
						target.save(targetTemp)
						newResources[key] = f"{targetTemp.name}"

				# If target is an index-table,we automatically recognize it as scp-file,so you do not need appoint it.
				elif type_name(target) == "IndexTable":
					if prefix != " ":
						errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
						errMes += f"Because we will decide the prefix depending on its data type."
						raise WrongOperation(errMes)

					target = target.sort()
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.save()
						newResources[key] = "scp:-"
					else:
						targetTemp = fhm.create("w+",suffix=".scp",encoding="utf-8")
						target.save(targetTemp)
						newResources[key] = f"scp:{targetTemp.name}"

				elif isinstance(target,(str,int,float)):
					# file or other value parameter
					newResources[key] = f"{target}"

				elif isinstance(target,(BytesMatrix,BytesVector)):
					if prefix != " ":
						errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
						errMes += f"Because we will decide the prefix depending on its data type."
						raise WrongOperation(errMes)

					target = target.sort()
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.data
						newResources[key] = "ark:-"
					else:
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newResources[key] = f"ark:{targetTemp.name}"

				elif isinstance(target,(NumpyMatrix,NumpyVector)):
					if prefix != " ":
						errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
						errMes += f"Because we will decide the prefix depending on its data type."
						raise WrongOperation(errMes)

					target = target.sort()
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.to_bytes().data
						newResources[key] = "ark:-"
					else:
						target = target.to_bytes()
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newResources[key] = f"ark:{targetTemp.name}"

				elif isinstance(target,BytesArchive):
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.data
						newResources[key] = "-"
					else:
						targetTemp = fhm.create("wb+")
						target.save(targetTemp)
						newResources[key] = f"{targetTemp.name}"

				else:
					raise UnsupportedType(f"<target> should be IndexTable,ListTable,file name,int or float value,or exkaldi achieve object but got: {type_name(target)}.")

			# Then,process output stream
			outFile = outFiles[0]
			newResources["outFile"] = outFile
			# A bool here means that no resource took the standard input stream.
			inputsBuffer = None if isinstance(inputsBuffer,bool) else inputsBuffer
			# Then run command
			finalCmd = cmdPattern.format(**newResources)
			out,err,cod = run_shell_command(finalCmd,stdin="PIPE",stdout="PIPE",stderr="PIPE",inputs=inputsBuffer)

			if analyzeResult:
				if cod != 0:
					# only report the names of the piped commands,not their arguments
					finalCmd = ",".join([cmd.strip().split(maxsplit=1)[0] for cmd in finalCmd.split("|")])
					raise KaldiProcessError(f"Failed to run Kaldi command: {finalCmd}.",err.decode())

			if outFile == "-":
				if generateArchive is not None:
					if generateArchive == "feat":
						out = BytesFeat(data=out,name=archiveNames[0])
					elif generateArchive == "ali":
						out = BytesAliTrans(data=out,name=archiveNames[0])
					elif generateArchive == "cmvn":
						out = BytesCMVN(data=out,name=archiveNames[0])
					else:
						out = BytesFmllr(data=out,name=archiveNames[0])
					return out
				else:
					return (cod,err,out)
			else:
				if generateArchive is not None:
					return load_index_table(outFile,name=archiveNames[0],useSuffix="ark")
				else:
					return (cod,err,outFile)

		else:
			# In this case,all input IO stream must be files.
			for key,countPrefix in auxiliaryInfo.items():
				count,prefix = countPrefix
				values = resources[key]
				newValues = []
				for target in values:

					# If target is scp resource
					if type_name(target) in ["ListTable","Transcription"]:
						if prefix not in [":","="]:
							errMes = f"There might miss prefix such as 'ark:' or 'scp:' or '--option=' in command pattern before resource: {key}."
							errMes += "Check the command line please. If you still think there dose not need the prefix,"
							errMes += "save this ListTable or Transcription into file and instead it will this file name."
							errMes += "In that case,we will skip checking the prefix."
							raise WrongOperation(errMes)

						target = target.sort()
						targetTemp = fhm.create("w+",encoding="utf-8")
						target.save(targetTemp)
						newValues.append(f"{targetTemp.name}")

					elif type_name(target) == "IndexTable":
						if prefix != " ":
							errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
							errMes += f"Because we will decide the prefix depending on its data type."
							raise WrongOperation(errMes)

						target = target.sort()
						targetTemp = fhm.create("w+",suffix=".scp",encoding="utf-8")
						target.save(targetTemp)
						newValues.append(f"scp:{targetTemp.name}")

					elif isinstance(target,(str,float,int)):
						# file name or other value parameters
						newValues.append(f"{target}")

					elif isinstance(target,(BytesMatrix,BytesVector)):
						if prefix != " ":
							errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
							errMes += f"Because we will decide the prefix depending on its data type."
							raise WrongOperation(errMes)

						target = target.sort()
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newValues.append(f"ark:{targetTemp.name}")

					elif isinstance(target,(NumpyMatrix,NumpyVector)):
						if prefix != " ":
							errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
							errMes += f"Because we will decide the prefix depending on its data type."
							raise WrongOperation(errMes)

						target = target.sort().to_bytes()
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newValues.append(f"ark:{targetTemp.name}")

					elif isinstance(target,BytesArchive):
						targetTemp = fhm.create("wb+")
						target.save(targetTemp)
						newValues.append(f"{targetTemp.name}")

					else:
						raise UnsupportedType(f"<target> should be IndexTable,ListTable,Transcription,file,int or float values or exkaldi achieve object but got: {type_name(target)}.")

				newResources[key] = newValues

			newResources["outFile"] = outFiles
			# assign these resources to each process and generate multiple commands
			parallelResources = []
			for i in range(parallel):
				parallelResources.append({})
				for key,items in newResources.items():
					parallelResources[-1][key] = items[i]
			cmds = [ cmdPattern.format(**res) for res in parallelResources ]
			# run
			flags = run_shell_command_parallel(cmds,timeout=timeout)

			finalResult = []
			done = True
			for index,info in enumerate(flags):
				cod,err = info
				if analyzeResult and cod != 0:
					print(f"{index}/{len(flags)} error tracking")
					print(err.decode())
					done = False
				finalResult.append( (cod,err,outFiles[index]) )

			if analyzeResult and (not done):
				# only report the names of the piped commands,not their arguments
				finalCmd = ",".join([cmd.strip().split(maxsplit=1)[0] for cmd in cmds[0].split("|")])
				raise KaldiProcessError(f"Failed to run Kaldi command: {finalCmd}. Look the error messages above.")
			else:
				if generateArchive is not None:
					for i,fileName in enumerate(outFiles):
						finalResult[i] = load_index_table(fileName,name=archiveNames[i],useSuffix="ark")

			return finalResult
Ejemplo n.º 10
0
def train_ngrams_srilm(lexicons, order, textFile, outFile, config=None):
    '''
    Train an n-grams language model with the SRILM toolkit.
    If no discount option is given through <config>, "-kndiscount" is used by default.

    Args:
        <lexicons>: an Exkaldi LexiconBank object.
        <order>: the maximum order of n-grams, a positive int smaller than 10.
        <textFile>: text corpus file.
        <outFile>: ARPA out file name. The ".arpa" suffix is appended when missing.
        <config>: configures, a Python dict object. Boolean values are passed
            as flags (only when True), other values as "key value" pairs.

    You can use .check_config("train_ngrams_srilm") function to get configure information that you can set.
    Also you can run shell command "ngram-count" to look their meaning.

    Return:
        The absolute path of the generated ARPA file.

    Raises:
        AssertionError: if an argument has an unexpected type or value.
        WrongPath: if <textFile> does not exist.
        ShellProcessError: if sampling lines from the text file fails.
        WrongDataFormat: if the text file is empty or does not look space-separated.
        KaldiProcessError: if ngram-count fails or produces a missing/empty file.
    '''
    import shlex  # local import: only needed to quote arguments for the shell

    assert isinstance(
        order, int
    ) and order > 0 and order < 10, "Expected <n> is a positive int value and it must be smaller than 10."
    assert isinstance(textFile,
                      str), "Expected <textFile> is name-like string."
    assert isinstance(outFile, str), "Expected <outFile> is name-like string."
    assert type_name(
        lexicons
    ) == "LexiconBank", f"Expected <lexicons> is exkaldi LexiconBank object but got {type_name(lexicons)}."

    ExkaldiInfo.prepare_srilm()

    if not os.path.isfile(textFile):
        raise WrongPath(f"No such file:{textFile}")

    # Sample up to 100 lines and count the spaces in them, to check whether
    # the corpus looks like space-separated text.
    cmd = f"shuf {shlex.quote(textFile)} -n 100"
    out, err, cod = run_shell_command(cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
    if (isinstance(cod, int) and cod != 0):
        print(err.decode())
        raise ShellProcessError("Failed to sample from text file.")
    elif out == b'':
        raise WrongDataFormat("Void text file.")
    else:
        out = out.decode().strip().split("\n")
        spaceCount = sum(line.count(" ") for line in out)
        if spaceCount < len(out) // 2:
            raise WrongDataFormat(
                "The text file doesn't seem to be separated by spaces or extremely short."
            )

    # Query the lexicon before the temporary file is created, so that a failure
    # here cannot leave an open temporary file behind.
    unkSymbol = lexicons("oov")
    lexiconp = lexicons("lexiconp")
    words = [x[0] for x in lexiconp.keys()]

    # The context manager closes (and thereby removes) the temporary word list on every exit path.
    with tempfile.NamedTemporaryFile("w+", encoding='utf-8',
                                     suffix=".txt") as wordlist:
        wordlist.write("\n".join(words))
        wordlist.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                        # Any boolean "*discount" key (even when False) suppresses
                        # the default "-kndiscount" below.
                        if key.endswith("discount"):
                            specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        # The OOV symbol is quoted: an unquoted symbol such as <UNK> would be
        # parsed by the shell as input/output redirection.
        cmd = (
            f"ngram-count -text {shlex.quote(textFile)} -order {order} -limit-vocab "
            f"-vocab {shlex.quote(wordlist.name)} -unk -map-unk {shlex.quote(f'{unkSymbol}')} "
        )
        # Hand the validated extra options over to ngram-count. They used to be
        # assembled but never added to the command, so <config> had no effect.
        cmd += extraConfig + " "
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        cmd += f" -lm {shlex.quote(outFile)}"

        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            # Remove a truncated or empty result so that it cannot be mistaken for a model.
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError(
                'Failed to generate ngrams language model.')

        return os.path.abspath(outFile)
Ejemplo n.º 11
0
def train_ngrams_kenlm(lexicons, order, textFile, outFile, config=None):
    '''
    Train an n-grams language model with the KenLM toolkit.

    Args:
        <lexicons>: words.txt file path or Exkaldi LexiconBank object.
            In a file, the first field of each non-empty line is taken as a word.
        <order>: the maximum order of n-grams, an int between 1 and 6.
        <textFile>: text corpus file.
        <outFile>: ARPA out file name. The ".arpa" suffix is appended when missing.
        <config>: configures, a Python dict object. Boolean values are passed
            as flags (only when True), other values as "key value" pairs.

    You can use .check_config("train_ngrams_kenlm") function to get configure information that you can set.
    Also you can run shell command "lmplz" to look their meaning.

    Return:
        The absolute path of the generated ARPA file.

    Raises:
        AssertionError: if <order>, <config> or <outFile> has an unexpected type or value.
        WrongPath: if <textFile> or the lexicon file does not exist.
        ShellProcessError: if sampling lines from the text file fails.
        WrongDataFormat: if the text file is empty or does not look space-separated.
        WrongOperation: if both spellings of the same option are given in <config>.
        UnsupportedType: if <lexicons> is neither a LexiconBank nor a file path.
        KenlmProcessError: if lmplz fails or produces a missing/empty file.
    '''
    import shlex  # local import: only needed to quote paths for the shell

    assert isinstance(
        order, int
    ) and 0 < order <= 6, "We support maximum 6-grams LM in current version."

    if not os.path.isfile(textFile):
        raise WrongPath("No such file:{}".format(textFile))

    # Sample up to 100 lines and count the spaces in them, to check whether
    # the corpus looks like space-separated text.
    cmd = f"shuf {shlex.quote(textFile)} -n 100"
    out, err, cod = run_shell_command(cmd,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE)
    if (isinstance(cod, int) and cod != 0):
        print(err.decode())
        raise ShellProcessError("Failed to sample from text file.")
    elif out == b'':
        raise WrongDataFormat("Void text file.")
    else:
        out = out.decode().strip().split("\n")
        spaceCount = sum(line.count(" ") for line in out)
        if spaceCount < len(out) // 2:
            raise WrongDataFormat(
                "The text file doesn't seem to be separated by spaces or extremely short."
            )

    extraConfig = " "
    if config is not None:
        assert isinstance(
            config, dict
        ), f"<config> should be dict object but got: {type_name(config)}."
        if check_config(name='train_ngrams_kenlm', config=config):
            # The long and the short spelling of one option must not be mixed.
            if "--temp_prefix" in config.keys() and "-T" in config.keys():
                raise WrongOperation(
                    '"--temp_prefix" and "-T" is the same configure so only one of them is expected.'
                )
            if "--memory" in config.keys() and "-S" in config.keys():
                raise WrongOperation(
                    '"--memory" and "-S" is the same configure so only one of them is expected.'
                )
            for key, value in config.items():
                if isinstance(value, bool):
                    if value is True:
                        extraConfig += f"{key} "
                else:
                    extraConfig += f"{key} {value} "

    assert isinstance(outFile, str), "<outFile> should be a string."
    if not outFile.rstrip().endswith(".arpa"):
        outFile += ".arpa"
    make_dependent_dirs(outFile, pathIsFile=True)

    # Collect the vocabulary before the temporary file is created.
    if type_name(lexicons) == "LexiconBank":
        ws = list(lexicons("words").keys())
    elif isinstance(lexicons, str):
        if not os.path.isfile(lexicons):
            raise WrongPath(f"No such file:{lexicons}.")
        with open(lexicons, "r", encoding="utf-8") as fr:
            # keep the first field of every non-empty line
            ws = [
                fields[0]
                for fields in (line.split(maxsplit=1) for line in fr)
                if fields
            ]
    else:
        raise UnsupportedType(
            "<lexicons> should be LexiconBank object or file path.")
    # the vocabulary size estimate is rounded up to a multiple of 10
    words_count = math.ceil(len(ws) / 10) * 10

    # The context manager closes (and thereby removes) the temporary word list on every exit path.
    with tempfile.NamedTemporaryFile("w+", suffix=".txt",
                                     encoding="utf-8") as words:
        words.write("\n".join(ws))
        words.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        # Quote every path so that spaces or shell metacharacters in a file name
        # cannot split or alter the command line.
        cmd = (
            f"{shlex.quote(KenLMTool)}{extraConfig}-o {order} --vocab_estimate {words_count} "
            f"--text {shlex.quote(textFile)} --arpa {shlex.quote(outFile)} "
            f"--limit_vocab_file {shlex.quote(words.name)}"
        )
        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            # Remove a truncated or empty result so that it cannot be mistaken for a model.
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate arpa file.")

        return os.path.abspath(outFile)