def load_ngrams(target, name="gram"):
    '''
    Load an N-Grams language model from an ARPA or KenLM binary file.

    Args:
        <target>: file path with suffix .arpa or .binary.

    Return:
        a KenNGrams object.
    '''
    declare.is_file("target", target)
    target = target.strip()

    with FileHandleManager() as fhm:

        if target.endswith(".arpa"):
            # Convert the ARPA file to a temporary KenLM binary file before loading.
            modelTemp = fhm.create("wb+", suffix=".binary")
            arpa_to_binary(target, modelTemp.name)
            modelTemp.seek(0)
            model = KenNGrams(modelTemp.name, name=name)
            model._path = target
        elif target.endswith(".binary"):
            model = KenNGrams(target, name=name)
        else:
            raise UnsupportedType(f"Unknown suffix. Language model file should have the suffix .arpa or .binary but got: {target}.")

        return model

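# A hedged usage sketch (not part of the library); "lm.arpa" is an illustrative
# file name. Loading an .arpa file converts it to a temporary KenLM binary first,
# so loading a .binary file directly is faster.
def _demo_load_ngrams():
    model = load_ngrams("lm.arpa", name="trigram")
    return model
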
def paste_feature(feats):
    '''
    Paste features together along the feature dimension.

    Args:
        <feats>: a list of feature objects.

    Return:
        a new feature object.
    '''
    declare.kaldi_existed()
    assert isinstance(feats, (list, tuple)) and len(feats) > 0, "<feats> should be a non-empty list or tuple of feature objects."
    for fe in feats:
        declare.is_feature("feats", fe)

    allResp = []
    pastedName = []

    with FileHandleManager() as fhm:

        for ot in feats:
            if isinstance(ot, BytesFeat):
                temp = fhm.create("wb+", suffix=".ark")
                ot.sort(by="utt").save(temp)
                allResp.append(f"ark:{temp.name}")
            elif isinstance(ot, NumpyFeat):
                temp = fhm.create("wb+", suffix=".ark")
                ot.sort(by="utt").to_bytes().save(temp)
                allResp.append(f"ark:{temp.name}")
            else:
                temp = fhm.create("w+", suffix=".scp")
                ot.sort(by="utt").save(temp)
                allResp.append(f"scp:{temp.name}")
            pastedName.append(ot.name)

        allResp = " ".join(allResp)
        cmd = f"paste-feats {allResp} ark:-"

        out, err, cod = run_shell_command(cmd, stdin="PIPE", stdout="PIPE", stderr="PIPE")

        if cod != 0 or out == b'':
            raise KaldiProcessError("Failed to paste features.", err.decode())
        else:
            pastedName = ",".join(pastedName)
            pastedName = f"paste({pastedName})"
            # A new index table will need to be generated later.
            return BytesFeat(out, name=pastedName, indexTable=None)

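# A hedged usage sketch (not part of the library). It assumes <mfcc> and <pitch>
# are exkaldi feature objects covering the same utterance IDs. The result's
# dimension is the sum of the input dimensions, and its name records the sources,
# e.g. "paste(mfcc,pitch)".
def _demo_paste_feature(mfcc, pitch):
    return paste_feature([mfcc, pitch])
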
def __read_index_table_from_scp_file(fileName):
    '''
    Read an index table from a Kaldi scp file.
    '''
    newTable = IndexTable()

    with FileHandleManager() as fhm:

        fr = fhm.open(fileName, "r", encoding="utf-8")
        lines = fr.readlines()

        for lineID, lineTxt in enumerate(lines):
            line = lineTxt.strip().split()
            if len(line) == 0:
                continue
            elif len(line) == 1:
                raise WrongDataFormat(f"line {lineID}: {lineTxt}\n" + "Missing complete utterance-filepath information.")
            elif len(line) > 2:
                raise WrongDataFormat("We do not support reading an index table from binary data generated via a pipeline. The second value should be the ark file path and the offset.")
            else:
                uttID = line[0]
                line = line[1].split(":")
                if len(line) != 2:
                    raise WrongDataFormat(f"line {lineID}: {lineTxt}\n" + "Missing complete file path and offset information.")
                arkFileName = line[0]
                # The scp offset points just past the "uttID " prefix inside the
                # ark file, so rewind to the true start of the record.
                startIndex = int(line[1]) - 1 - len(uttID)

                arkFr = fhm.call(arkFileName)
                if arkFr is None:
                    arkFr = fhm.open(arkFileName, "rb")
                arkFr.seek(startIndex)

                _, frames, dataSize = __read_one_record_from_ark(arkFr)
                arkFileName = os.path.abspath(arkFileName)
                newTable[uttID] = newTable.spec(frames, startIndex, dataSize, arkFileName)

    return newTable

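# For reference, one line of a Kaldi scp file looks like (values illustrative):
#
#     utt-001 /path/to/feat.ark:12
#
# The offset (12) points just past the "utt-001 " prefix written inside the ark
# file, which is why the reader above rewinds by len(uttID) + 1 bytes to reach
# the true start of the record before parsing its header.
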
def load_ali(target, aliType="transitionID", name="ali", hmm=None):
    '''
    Load alignment data.

    Args:
        <target>: a Python dict object, bytes object, exkaldi alignment object, Kaldi alignment file or .npy file.
        <aliType>: None, or one of 'transitionID', 'phoneID', 'pdfID'. It decides which alignment object is returned.
        <name>: a string.
        <hmm>: a file path or an exkaldi HMM object.

    Return:
        an exkaldi alignment object.
    '''
    declare.is_valid_string("name", name)
    declare.is_instances("aliType", aliType, [None, "transitionID", "phoneID", "pdfID"])
    declare.kaldi_existed()

    def transform(data, cmd):
        out, err, cod = run_shell_command(cmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=data)
        if (isinstance(cod, int) and cod != 0) or out == b'':
            raise KaldiProcessError('Failed to transform alignment.', err.decode())
        else:
            result = {}
            sp = BytesIO(out)
            for line in sp.readlines():
                line = line.decode()
                line = line.strip().split()
                utt = line[0]
                matrix = np.array(line[1:], dtype=np.int32)
                result[utt] = matrix
            return result

    if isinstance(target, dict):
        if aliType is None:
            result = NumpyAli(target, name)
        elif aliType == "transitionID":
            result = NumpyAliTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAliPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAliPdf(target, name)
        else:
            raise WrongOperation(f"<aliType> should be None, 'transitionID', 'phoneID' or 'pdfID' but got {aliType}.")
        result.check_format()
        return result

    elif isinstance(target, (NumpyAli, NumpyAliTrans, BytesAliTrans)):
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, IndexTable):
        result = target.fetch(arkType="ali")
        if aliType in ["phoneID", "pdfID"]:
            result = result.to_numpy(aliType, hmm)
        result.rename(name)
        return result

    elif isinstance(target, str):

        allFiles = list_files(target)

        numpyAli = {}
        bytesAli = []

        for fileName in allFiles:
            fileName = fileName.strip()
            if fileName.endswith(".npy"):
                try:
                    temp = np.load(fileName, allow_pickle=True)
                    numpyAli.update(temp)
                except Exception:
                    raise UnsupportedType(f'This is not a valid exkaldi npy file: {fileName}.')
            else:
                if fileName.endswith('.gz'):
                    cmd = f'gunzip -c {fileName}'
                else:
                    cmd = f'cat {fileName}'

                if aliType is None or aliType == "transitionID":
                    out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
                    if (isinstance(cod, int) and cod != 0) or out == b'':
                        raise ShellProcessError(f"Failed to get the alignment data from file: {fileName}.", err.decode())
                    else:
                        bytesAli.append(out)
                else:
                    with FileHandleManager() as fhm:

                        declare.is_potential_hmm("hmm", hmm)
                        if not isinstance(hmm, str):
                            hmmTemp = fhm.create("wb+")
                            hmm.save(hmmTemp)
                            hmm = hmmTemp.name

                        if aliType == "phoneID":
                            cmd += f" | ali-to-phones --per-frame=true {hmm} ark:- ark,t:-"
                        else:
                            cmd += f" | ali-to-pdf {hmm} ark:- ark,t:-"

                        temp = transform(None, cmd)
                        numpyAli.update(temp)

        bytesAli = b"".join(bytesAli)

        if aliType is None:
            if len(numpyAli) == 0:
                return BytesAliTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAli(numpyAli, name=name)
            else:
                result = NumpyAliTrans(numpyAli) + BytesAliTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "transitionID":
            if len(numpyAli) == 0:
                return BytesAliTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAliTrans(numpyAli, name=name)
            else:
                result = NumpyAliTrans(numpyAli) + BytesAliTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "phoneID":
            return NumpyAliPhone(numpyAli, name=name)
        else:
            return NumpyAliPdf(numpyAli, name=name)

    else:
        raise UnsupportedType(f"<target> should be a dict, file name, exkaldi alignment object or index table object but got: {type_name(target)}.")

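# A hedged usage sketch (not part of the library). The file names are illustrative;
# an HMM model is only required when converting alignments to phone or pdf IDs.
def _demo_load_ali():
    transAli = load_ali("ali.1.gz", aliType="transitionID", name="ali")
    phoneAli = load_ali("ali.1.gz", aliType="phoneID", name="ali", hmm="final.mdl")
    return transAli, phoneAli
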
def __compute_feature(target, kaldiTool, useSuffix=None, name="feat", outFile=None):
    '''
    The base function to compute features.
    '''
    declare.kaldi_existed()

    if useSuffix is not None:
        declare.is_valid_string("useSuffix", useSuffix)
        useSuffix = useSuffix.strip().lower()[-3:]
        declare.is_instances("useSuffix", useSuffix, ["scp", "wav"])
    else:
        useSuffix = ""

    targets, kaldiTools, useSuffixs, names, outFiles = check_multiple_resources(target, kaldiTool, useSuffix, name, outFile=outFile)

    # pretreatment
    fromSegment = False
    with FileHandleManager() as fhm:

        segments = []
        for index, kaldiTool, target, useSuffix, name in zip(range(len(outFiles)), kaldiTools, targets, useSuffixs, names):

            declare.is_classes("target", target, ["str", "ListTable", "WavSegment"])
            declare.is_valid_string("name", name)

            if isinstance(target, str):

                allFiles = list_files(target)
                target = ListTable()

                for filePath in allFiles:
                    filePath = filePath.strip()
                    if filePath[-4:].lower() == ".wav":
                        fileName = os.path.basename(filePath)
                        uttID = fileName[0:-4].replace(".", "")
                        target[uttID] = filePath
                    elif filePath[-4:].lower() == '.scp':
                        target += load_list_table(filePath)
                    elif "wav" == useSuffix:
                        fileName = os.path.basename(filePath)
                        uttID = fileName.replace(".", "")
                        target[uttID] = filePath
                    elif "scp" == useSuffix:
                        target += load_list_table(filePath)
                    else:
                        raise UnsupportedType('Unknown file suffix. You can declare whether <useSuffix> is "wav" or "scp".')

                if len(target) == 0:
                    raise WrongDataFormat("<target> did not include any data to compute features from.")
                targets[index] = target

            elif type_name(target) == "WavSegment":

                segTemp = fhm.create("w+", suffix=".seg", encoding="utf-8")
                target.save(segTemp)
                segments.append(segTemp.name)

                targets[index] = target.detach_wav()
                fromSegment = True

        if fromSegment:
            # define the command pattern
            cmdPattern = "extract-segments scp:{wavFile} {segment} ark:- | {kaldiTool} ark:- ark:{outFile}"
            # define the resources
            resources = {"wavFile": targets, "segment": segments, "kaldiTool": kaldiTools, "outFile": outFiles}
        else:
            # define the command pattern
            cmdPattern = "{kaldiTool} scp:{wavFile} ark:{outFile}"
            # define the resources
            resources = {"wavFile": targets, "kaldiTool": kaldiTools, "outFile": outFiles}

        # run
        return run_kaldi_commands_parallel(resources, cmdPattern, analyzeResult=True, generateArchive="feat", archiveNames=names)

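# A hedged sketch of how a public front end might drive __compute_feature
# (the wrapper name and the Kaldi options shown are illustrative only).
def _demo_compute_feature(wavScp):
    kaldiTool = "compute-mfcc-feats --use-energy=false"
    return __compute_feature(wavScp, kaldiTool, useSuffix="scp", name="mfcc")
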
def wer(ref, hyp, ignore=None, mode='all'):
    '''
    Compute the WER (word error rate) between <ref> and <hyp>.

    Args:
        <ref>, <hyp>: exkaldi transcription objects or file paths.
        <ignore>: a symbol to ignore.
        <mode>: "all" or "present".

    Return:
        a namedtuple of score information.
    '''
    declare.is_potential_transcription("ref", ref)
    declare.is_potential_transcription("hyp", hyp)
    declare.is_instances("mode", mode, ['all', 'present'])
    declare.kaldi_existed()

    if ignore is not None:
        declare.is_valid_string("ignore", ignore)

    with FileHandleManager() as fhm:

        if ignore is None:

            if type_name(hyp) == "Transcription":
                hypTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
                hyp.save(hypTemp)
                hyp = hypTemp.name

            if type_name(ref) == "Transcription":
                refTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
                ref.save(refTemp)
                ref = refTemp.name

            cmd = f'compute-wer --text --mode={mode} ark:{ref} ark,p:{hyp}'
            scoreOut, scoreErr, _ = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")

        else:
            # remove the ignored symbol in hyp
            if type_name(hyp) == "Transcription":
                hyp = hyp.save()
            else:
                with open(hyp, "r", encoding="utf-8") as fr:
                    hyp = fr.read()

            hypTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            # Capture sed's output instead of redirecting it, so that the
            # empty-output check below is meaningful; then write it to the temp file.
            cmd = f'sed "s/{ignore} //g"'
            hypOut, err, cod = run_shell_command(cmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=hyp)
            if cod != 0 or len(hypOut) == 0:
                raise WrongDataFormat("<hyp> has wrong data format.", err.decode())
            hypTemp.write(hypOut.decode())
            hypTemp.seek(0)

            # remove the ignored symbol in ref
            if type_name(ref) == "Transcription":
                ref = ref.save()
            else:
                with open(ref, "r", encoding="utf-8") as fr:
                    ref = fr.read()

            refTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            cmd = f'sed "s/{ignore} //g"'
            refOut, err, cod = run_shell_command(cmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=ref)
            if cod != 0 or len(refOut) == 0:
                raise WrongDataFormat("<ref> has wrong data format.", err.decode())
            refTemp.write(refOut.decode())
            refTemp.seek(0)

            # score
            cmd = f'compute-wer --text --mode={mode} ark:{refTemp.name} ark,p:{hypTemp.name}'
            scoreOut, scoreErr, _ = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")

    if len(scoreOut) == 0:
        raise KaldiProcessError("Failed to compute WER.", scoreErr.decode())

    out = scoreOut.decode().split("\n")
    pattern1 = r'%WER (.*) \[ (.*) \/ (.*),(.*) ins,(.*) del,(.*) sub \]'
    pattern2 = r"%SER (.*) \[ (.*) \/ (.*) \]"
    pattern3 = r"Scored (.*) sentences,(.*) not present in hyp."
    s1 = re.findall(pattern1, out[0])[0]
    s2 = re.findall(pattern2, out[1])[0]
    s3 = re.findall(pattern3, out[2])[0]

    return namedtuple("Score", ["WER", "words", "insErr", "delErr", "subErr", "SER", "sentences", "wrongSentences", "missedSentences"])(
        float(s1[0]),  # WER
        int(s1[2]),    # words
        int(s1[3]),    # insertions
        int(s1[4]),    # deletions
        int(s1[5]),    # substitutions
        float(s2[0]),  # SER
        int(s2[2]),    # sentences (the denominator of %SER)
        int(s2[1]),    # wrong sentences (the numerator of %SER)
        int(s3[1]),    # missed sentences
    )

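# A hedged usage sketch (not part of the library); "ref.txt" and "hyp.txt" are
# illustrative transcription files in Kaldi text format ("uttID word1 word2 ...").
def _demo_wer():
    score = wer("ref.txt", "hyp.txt", mode="all")
    print(f"WER: {score.WER}%, {score.insErr} ins, {score.delErr} del, {score.subErr} sub")
    return score
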
def train_ngrams_srilm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the SRILM toolkit.
    If you don't specify a discounting method via <config>, "kndiscount" is used by default.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of the N-Grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: the output file name of the ARPA LM.
        <config>: extra configurations, a Python dict object.

    You can use check_config("train_ngrams_srilm") to get a reference of the extra configurations,
    or run the shell command "ngram-count" to look up their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    # verify the max order
    declare.less_equal("order", order, "max order", 9)
    # prepare the srilm tool
    ExkaldiInfo.prepare_srilm()

    with FileHandleManager() as fhm:
        # Check whether this is a reasonable text corpus, i.e. one separated by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces, or the sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The transcription doesn't seem to be separated by spaces, or the sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        unkSymbol = lexicons("oov")

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                        if key.endswith("discount"):
                            specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f'ngram-count -text {text} -order {order} -limit-vocab -vocab {wordlistTemp.name} -unk -map-unk "{unkSymbol}" '
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "
        cmd += extraConfig  # append the user-specified options

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError('Failed to generate the N-Grams language model.')

        return outFile

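# A hedged usage sketch (not part of the library). <lexicons> is assumed to be a
# prepared LexiconBank; "-wbdiscount" is a real ngram-count option, and passing
# any "...discount" key suppresses the default "-kndiscount".
def _demo_train_srilm(lexicons):
    return train_ngrams_srilm(lexicons, order=3, text="corpus.txt",
                              outFile="lm.arpa", config={"-wbdiscount": True})
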
def train_ngrams_kenlm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the KenLM toolkit.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of the N-Grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: the output file name of the ARPA LM.
        <config>: extra configurations, a Python dict object.

    You can use check_config("train_ngrams_kenlm") to get a reference of the extra configurations,
    or run the shell command "lmplz" to look up their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)

    declare.less_equal("order", order, "max order", 9)

    with FileHandleManager() as fhm:
        # Check whether this is a reasonable text corpus, i.e. one separated by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces, or the sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The transcription doesn't seem to be separated by spaces, or the sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        extraConfig = " "
        if config is not None:
            if check_config(name='train_ngrams_kenlm', config=config):
                if "--temp_prefix" in config.keys() and "-T" in config.keys():
                    raise WrongOperation('"--temp_prefix" and "-T" are the same configuration, so only one of them is expected.')
                if "--memory" in config.keys() and "-S" in config.keys():
                    raise WrongOperation('"--memory" and "-S" are the same configuration, so only one of them is expected.')
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f"{key} {value} "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words_count = math.ceil(len(words) / 10) * 10
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {text} --arpa {outFile} --limit_vocab_file {wordlistTemp.name}"
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate the ARPA file.")

        return outFile

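# A hedged usage sketch (not part of the library). "-S" is a real lmplz option
# that limits memory use; the file names and the value are illustrative.
def _demo_train_kenlm(lexicons):
    return train_ngrams_kenlm(lexicons, order=3, text="corpus.txt",
                              outFile="lm.arpa", config={"-S": "20%"})
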
def run_kaldi_commands_parallel(resources, cmdPattern, analyzeResult=True, timeout=ExkaldiInfo.timeout, generateArchive=None, archiveNames=None):
    '''
    Map resources to a command pattern and run the commands in parallel.

    Args:
        <resources>: a dict whose keys are resource names and whose values are lists of resource objects.
                     For example: {"feat": [BytesFeat01, BytesFeat02, ...], "outFile": ["newFeat01.ark", "newFeat02.ark", ...]}.
                     The "outFile" resource is necessary.
                     When there is only one process to run, "outFile" can be "-", which means the standard output stream.
        <cmdPattern>: a string to map the resources into. For example: "copy-feats {feat} ark:{outFile}".

    Return:
        a list of triples: (return code, error info, output file or buffer).
    '''
    declare.kaldi_existed()
    declare.is_classes("resources", resources, dict)
    declare.is_classes("cmdPattern", cmdPattern, str)
    assert "outFile" in resources.keys(), "<outFile> key and value is necessary in resources."

    declare.members_are_classes("the values of resources", resources.values(), [list, tuple])
    if generateArchive is not None:
        analyzeResult = True  # force analyzing the result

    # check the format of the command pattern
    nameIndexs = [i for i, c in enumerate(cmdPattern) if c == "{" or c == "}"]
    assert len(nameIndexs) % 2 == 0, f"The numbers of braces do not match in command pattern: '{cmdPattern}'."
    auxiliaryInfo = {}
    for i in range(0, len(nameIndexs), 2):
        name = cmdPattern[nameIndexs[i] + 1: nameIndexs[i + 1]]
        if name not in resources:
            raise WrongDataFormat(f"Resource is necessary but has not been provided: {name}.")
        prefix = "" if nameIndexs[i] == 0 else cmdPattern[nameIndexs[i] - 1]
        if name in auxiliaryInfo.keys():
            auxiliaryInfo[name][0] += 1
            if not prefix in auxiliaryInfo[name][1]:
                auxiliaryInfo[name][1] += prefix
        else:
            auxiliaryInfo[name] = [1, prefix]

    assert "outFile" in auxiliaryInfo.keys(), "Key: <outFile> is necessary in command pattern."
    _outFileCountInfo = auxiliaryInfo.pop("outFile")
    assert _outFileCountInfo[0] == 1, f"Only allow <outFile> to appear one time in command pattern but: {_outFileCountInfo[0]}."
    outFiles = resources.pop("outFile")

    for outFile in outFiles:
        if outFile != "-":
            make_dependent_dirs(outFile, pathIsFile=True)
    parallel = len(outFiles)

    if generateArchive is not None:
        declare.is_instances("generateArchive", generateArchive, ["feat", "cmvn", "ali", "fmllr"])
        if archiveNames is None:
            archiveNames = [generateArchive for i in range(parallel)]
        elif isinstance(archiveNames, str):
            archiveNames = [archiveNames for i in range(parallel)]
        elif isinstance(archiveNames, (list, tuple)):
            declare.equal("the number of archive names", len(archiveNames), "parallel", parallel)
        else:
            raise UnsupportedType(f"<archiveNames> should be a string, list or tuple but got: {type_name(archiveNames)}.")

    # regulate the resources and run
    with FileHandleManager() as fhm:

        newResources = {}
        if parallel == 1:
            # Detect whether there is a PIPE in the command pattern.
            testPlaceholder = dict((key, value[0]) if isinstance(value[0], str) else (key, "placeholder") for key, value in resources.items())
            testPlaceholder["outFile"] = "placeholder"
            testCmd = cmdPattern.format(**testPlaceholder)
            if "|" in testCmd:
                inputsBuffer = False
            else:
                inputsBuffer = True
            del testPlaceholder

            # regulate the resources
            # <inputsBuffer> starts as a bool flag; the first streamable resource
            # replaces it with the actual data buffer, so only one resource uses stdin.
            for key, countPrefix in auxiliaryInfo.items():
                count, prefix = countPrefix
                target = resources[key][0]

                # If the target is a list table, we cannot automatically decide whether it is scp-format or ark-format,
                # so you should appoint it in the command pattern.
                if type_name(target) in ["ListTable", "Transcription"]:
                    if prefix not in [":", "="]:
                        errMes = f"There might be a missing prefix such as 'ark:' or 'scp:' or '--option=' in the command pattern before resource: {key}. "
                        errMes += "Check the command line please. If you still think the prefix is unnecessary, "
                        errMes += "save this ListTable or Transcription to a file and use that file name instead. "
                        errMes += "In that case, we will skip checking the prefix."
                        raise WrongOperation(errMes)

                    target = target.sort()
                    if (inputsBuffer is True) and count == 1:
                        inputsBuffer = target.save()
                        newResources[key] = "-"
                    else:
                        targetTemp = fhm.create("w+", encoding="utf-8")
                        target.save(targetTemp)
                        newResources[key] = f"{targetTemp.name}"

                # If the target is an index table, we automatically recognize it as an scp file, so you do not need to appoint it.
                elif type_name(target) == "IndexTable":
                    if prefix != " ":
                        errMes = f"Do not need a prefix such as 'ark:' or 'scp:' in the command pattern before: {key}, "
                        errMes += "because we will decide the prefix depending on its data type."
                        raise WrongOperation(errMes)

                    target = target.sort()
                    if (inputsBuffer is True) and count == 1:
                        inputsBuffer = target.save()
                        newResources[key] = "scp:-"
                    else:
                        targetTemp = fhm.create("w+", suffix=".scp", encoding="utf-8")
                        target.save(targetTemp)
                        newResources[key] = f"scp:{targetTemp.name}"

                elif isinstance(target, (str, int, float)):
                    # a file name or another value parameter
                    newResources[key] = f"{target}"

                elif isinstance(target, (BytesMatrix, BytesVector)):
                    if prefix != " ":
                        errMes = f"Do not need a prefix such as 'ark:' or 'scp:' in the command pattern before: {key}, "
                        errMes += "because we will decide the prefix depending on its data type."
                        raise WrongOperation(errMes)

                    target = target.sort()
                    if (inputsBuffer is True) and count == 1:
                        inputsBuffer = target.data
                        newResources[key] = "ark:-"
                    else:
                        targetTemp = fhm.create("wb+", suffix=".ark")
                        target.save(targetTemp)
                        newResources[key] = f"ark:{targetTemp.name}"

                elif isinstance(target, (NumpyMatrix, NumpyVector)):
                    if prefix != " ":
                        errMes = f"Do not need a prefix such as 'ark:' or 'scp:' in the command pattern before: {key}, "
                        errMes += "because we will decide the prefix depending on its data type."
                        raise WrongOperation(errMes)

                    target = target.sort()
                    if (inputsBuffer is True) and count == 1:
                        inputsBuffer = target.to_bytes().data
                        newResources[key] = "ark:-"
                    else:
                        target = target.to_bytes()
                        targetTemp = fhm.create("wb+", suffix=".ark")
                        target.save(targetTemp)
                        newResources[key] = f"ark:{targetTemp.name}"

                elif isinstance(target, BytesArchive):
                    if (inputsBuffer is True) and count == 1:
                        inputsBuffer = target.data
                        newResources[key] = "-"
                    else:
                        targetTemp = fhm.create("wb+")
                        target.save(targetTemp)
                        newResources[key] = f"{targetTemp.name}"

                else:
                    raise UnsupportedType(f"<target> should be an IndexTable, ListTable, file name, int or float value, or exkaldi archive object but got: {type_name(target)}.")

            # Then, process the output stream.
            outFile = outFiles[0]
            newResources["outFile"] = outFile
            inputsBuffer = None if isinstance(inputsBuffer, bool) else inputsBuffer

            # Then run the command.
            finalCmd = cmdPattern.format(**newResources)
            out, err, cod = run_shell_command(finalCmd, stdin="PIPE", stdout="PIPE", stderr="PIPE", inputs=inputsBuffer)

            if analyzeResult:
                if cod != 0:
                    finalCmd = ",".join([cmd.strip().split(maxsplit=1)[0] for cmd in finalCmd.split("|")])
                    raise KaldiProcessError(f"Failed to run Kaldi command: {finalCmd}.", err.decode())

            if outFile == "-":
                if generateArchive is not None:
                    if generateArchive == "feat":
                        out = BytesFeat(data=out, name=archiveNames[0])
                    elif generateArchive == "ali":
                        out = BytesAliTrans(data=out, name=archiveNames[0])
                    elif generateArchive == "cmvn":
                        out = BytesCMVN(data=out, name=archiveNames[0])
                    else:
                        out = BytesFmllr(data=out, name=archiveNames[0])
                    return out
                else:
                    return (cod, err, out)
            else:
                if generateArchive is not None:
                    return load_index_table(outFile, name=archiveNames[0], useSuffix="ark")
                else:
                    return (cod, err, outFile)

        else:
            # In this case, all input IO streams must be files.
            for key, countPrefix in auxiliaryInfo.items():
                count, prefix = countPrefix
                values = resources[key]
                newValues = []
                for target in values:

                    # If the target is an scp resource
                    if type_name(target) in ["ListTable", "Transcription"]:
                        if prefix not in [":", "="]:
                            errMes = f"There might be a missing prefix such as 'ark:' or 'scp:' or '--option=' in the command pattern before resource: {key}. "
                            errMes += "Check the command line please. If you still think the prefix is unnecessary, "
                            errMes += "save this ListTable or Transcription to a file and use that file name instead. "
                            errMes += "In that case, we will skip checking the prefix."
                            raise WrongOperation(errMes)

                        target = target.sort()
                        targetTemp = fhm.create("w+", encoding="utf-8")
                        target.save(targetTemp)
                        newValues.append(f"{targetTemp.name}")

                    elif type_name(target) == "IndexTable":
                        if prefix != " ":
                            errMes = f"Do not need a prefix such as 'ark:' or 'scp:' in the command pattern before: {key}, "
                            errMes += "because we will decide the prefix depending on its data type."
                            raise WrongOperation(errMes)

                        target = target.sort()
                        targetTemp = fhm.create("w+", suffix=".scp", encoding="utf-8")
                        target.save(targetTemp)
                        newValues.append(f"scp:{targetTemp.name}")

                    elif isinstance(target, (str, float, int)):
                        # a file name or another value parameter
                        newValues.append(f"{target}")

                    elif isinstance(target, (BytesMatrix, BytesVector)):
                        if prefix != " ":
                            errMes = f"Do not need a prefix such as 'ark:' or 'scp:' in the command pattern before: {key}, "
                            errMes += "because we will decide the prefix depending on its data type."
                            raise WrongOperation(errMes)

                        target = target.sort()
                        targetTemp = fhm.create("wb+", suffix=".ark")
                        target.save(targetTemp)
                        newValues.append(f"ark:{targetTemp.name}")

                    elif isinstance(target, (NumpyMatrix, NumpyVector)):
                        if prefix != " ":
                            errMes = f"Do not need a prefix such as 'ark:' or 'scp:' in the command pattern before: {key}, "
                            errMes += "because we will decide the prefix depending on its data type."
                            raise WrongOperation(errMes)

                        target = target.sort().to_bytes()
                        targetTemp = fhm.create("wb+", suffix=".ark")
                        target.save(targetTemp)
                        newValues.append(f"ark:{targetTemp.name}")

                    elif isinstance(target, BytesArchive):
                        targetTemp = fhm.create("wb+")
                        target.save(targetTemp)
                        newValues.append(f"{targetTemp.name}")

                    else:
                        raise UnsupportedType(f"<target> should be an IndexTable, ListTable, Transcription, file, int or float value, or exkaldi archive object but got: {type_name(target)}.")

                newResources[key] = newValues

            newResources["outFile"] = outFiles

            # assign these resources to each process and generate multiple commands
            parallelResources = []
            for i in range(parallel):
                parallelResources.append({})
                for key, items in newResources.items():
                    parallelResources[-1][key] = items[i]
            cmds = [cmdPattern.format(**res) for res in parallelResources]

            # run
            flags = run_shell_command_parallel(cmds, timeout=timeout)

            finalResult = []
            done = True
            for index, info in enumerate(flags):
                cod, err = info
                if analyzeResult and cod != 0:
                    print(f"{index}/{len(flags)} error tracking")
                    print(err.decode())
                    done = False
                finalResult.append((cod, err, outFiles[index]))

            if analyzeResult and (not done):
                finalCmd = ",".join([cmd.strip().split(maxsplit=1)[0] for cmd in cmds[0].split("|")])
                raise KaldiProcessError(f"Failed to run Kaldi command: {finalCmd}. Look at the error messages above.")
            else:
                if generateArchive is not None:
                    for i, fileName in enumerate(outFiles):
                        finalResult[i] = load_index_table(fileName, name=archiveNames[i], useSuffix="ark")

            return finalResult

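# A hedged usage sketch (not part of the library). It assumes <feat> is an exkaldi
# bytes-feature archive whose subset(chunks=...) method splits it into equal parts.
# Because the archive carries its own "ark:"/"scp:" prefix, {feat} is left bare in
# the pattern, while {outFile} must appear exactly once.
def _demo_run_parallel(feat):
    resources = {"feat": feat.subset(chunks=2), "outFile": ["part1.ark", "part2.ark"]}
    cmdPattern = "copy-feats {feat} ark:{outFile}"
    return run_kaldi_commands_parallel(resources, cmdPattern)
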