def load_list_table(target, name="listTable"):
    '''
    Build a ListTable object from an in-memory mapping or from text file(s).

    Args:
        <target>: a dict object, a ListTable object or a file path. A path is expanded with list_files() and every resulting file is read.
        <name>: the name given to the new table.

    Return:
        a ListTable object.
    '''
    declare.is_classes("target", target, [dict, ListTable, str])

    table = ListTable(name=name)

    # In-memory sources are copied straight into the new table.
    if type_name(target) in ["dict", "ListTable"]:
        table.update(target)
        return table

    # Otherwise <target> is a path. Each line must hold a key, some whitespace, then the value.
    for filePath in list_files(target):
        with open(filePath, "r", encoding="utf-8") as reader:
            content = reader.readlines()

        for lineNumber, rawLine in enumerate(content, start=1):
            fields = rawLine.strip().split(maxsplit=1)
            if len(fields) < 2:
                message = f"Line Number: {lineNumber}\n"
                message += f"Line Content: {rawLine}\n"
                message += f"Missing paired key and value information in file:{filePath}."
                raise WrongDataFormat(message)
            key, value = fields
            table[key] = value

    return table
def load_lat(target, name="lat"):
    '''
    Load lattice data.

    Args:
        <target>: a bytes object or a lattice file path. A path is expanded with list_files(); files ending with ".gz" are piped through "gunzip -c", other files are read as raw bytes.
        <name>: a string.

    Return:
        An exkaldi Lattice object.
    '''
    # Raw bytes are wrapped directly.
    if isinstance(target, bytes):
        return Lattice(target, name)

    if not isinstance(target, str):
        raise UnsupportedType(f"Expected bytes object or lattice file but got: {type_name(target)}.")

    chunks = []
    for fileName in list_files(target):
        if fileName.endswith('.gz'):
            unzipCmd = 'gunzip -c {}'.format(fileName)
            out, err, _ = run_shell_command(unzipCmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # Only empty output is treated as failure here; the return code is not inspected.
            if out == b'':
                print(err.decode())
                raise WrongDataFormat('Failed to load Lattice.')
            chunks.append(out)
        else:
            try:
                with open(fileName, 'rb') as fr:
                    out = fr.read()
            except Exception as e:
                print("Load lattice file defeated. Please make sure it is a lattice file avaliable.")
                raise e
            chunks.append(out)

    # All files are concatenated into one lattice.
    try:
        merged = b"".join(chunks)
    except Exception:
        raise WrongOperation("Only support binary format lattice file.")

    return Lattice(data=merged, name=name)
def load_args(target):
    '''
    Load arguments from a file into the module-level <args> object.

    Args:
        <target>: file path. It must resolve to exactly one file.

    Return:
        the module-level <args> object, after it has been reset and reloaded from the file.
    '''
    matched = list_files(target)
    assert len(matched) == 1, "Cannot load auguments from multiple files."

    # <args> is only read here (never rebound), so it resolves to the module-level object.
    argFile = matched[0]
    args.reset()
    args.load(argFile)

    return args
def __read_data_from_file(fileName, useSuffix=None):
    '''
    Read data from file(s) and merge everything into one matrix object.

    Args:
        <fileName>: a file path string (not a directory). It is expanded with list_files().
        <useSuffix>: None, or a string ending with "ark", "scp" or "npy". It is applied to files whose own suffix is unknown, and it selects the form of the returned data.

    Return:
        the merged data after check_format(). The numpy form (NumpyMatrix plus converted bytes data) is returned if <useSuffix> is "npy", or if <useSuffix> is not given and the first file ends with "npy". Otherwise the bytes form is returned.

    Raise:
        WrongOperation if <fileName> is a directory.
        UnsupportedType if <fileName> is not a string, a file suffix is unknown, or an npy file cannot be parsed.
        KaldiProcessError if copy-feats fails.
    '''
    if useSuffix is not None:
        assert isinstance(useSuffix, str), "Expected <useSuffix> is a string."
        useSuffix = useSuffix.strip().lower()[-3:]
    else:
        useSuffix = ""
    assert useSuffix in [
        "", "scp", "ark", "npy"
    ], f'Expected <useSuffix> is "ark", "scp" or "npy" but got "{useSuffix}".'

    if not isinstance(fileName, str):
        raise UnsupportedType(
            f'Expected <fileName> is file name-like string but got a {type_name(fileName)}.'
        )
    if os.path.isdir(fileName):
        raise WrongOperation(
            f"Expected file name but got a directory:{fileName}.")
    allFiles = list_files(fileName)

    allData_bytes = BytesMatrix()
    allData_numpy = NumpyMatrix()

    def loadNpyFile(filePath):
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
        try:
            temp = np.load(filePath, allow_pickle=True)
            data = {utt_mat[0]: utt_mat[1] for utt_mat in temp}
        except Exception as e:
            # "except Exception" (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not reported as a file-format problem.
            raise UnsupportedType(
                f'Expected "npy" data with exkaldi format but got {filePath}.') from e
        return NumpyMatrix(data)

    def loadArkScpFile(filePath, suffix):
        ExkaldiInfo.vertify_kaldi_existed()
        # Anything other than "ark" is read with the "scp:" specifier.
        if suffix == "ark":
            cmd = 'copy-feats ark:'
        else:
            cmd = 'copy-feats scp:'
        cmd += '{} ark:-'.format(filePath)
        out, err, cod = run_shell_command(cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
        if (isinstance(cod, int) and cod != 0) or out == b'':
            print(err.decode())
            raise KaldiProcessError('Copy feat defeated.')
        return BytesMatrix(out)

    for filePath in allFiles:
        # The file's own suffix wins; <useSuffix> is only the fallback.
        sfx = filePath[-3:].lower()
        if sfx == "npy":
            allData_numpy += loadNpyFile(filePath)
        elif sfx in ["ark", "scp"]:
            allData_bytes += loadArkScpFile(filePath, sfx)
        elif useSuffix == "npy":
            allData_numpy += loadNpyFile(filePath)
        elif useSuffix in ["ark", "scp"]:
            allData_bytes += loadArkScpFile(filePath, useSuffix)
        else:
            raise UnsupportedType(
                'Unknown file suffix. You can assign the <useSuffix> with "scp", "ark" or "npy".'
            )

    # Without an explicit <useSuffix>, the first file decides the output form.
    if useSuffix == "":
        wantNumpy = allFiles[0][-3:].lower() == "npy"
    else:
        wantNumpy = useSuffix == "npy"

    if wantNumpy:
        result = allData_numpy + allData_bytes.to_numpy()
    else:
        result = allData_bytes + allData_numpy.to_bytes()

    result.check_format()
    return result
def load_ali(target, aliType=None, name="ali", hmm=None):
    '''
    Load alignment data.

    Args:
        <target>: Python dict object, exkaldi alignment object, kaldi alignment file or .npy file.
        <aliType>: None, or one of 'transitionID', 'phoneID', 'pdfID'. It decides which alignment object is built.
        <name>: a non-empty string.
        <hmm>: file path or exkaldi HMM object. It is needed when a kaldi alignment file is loaded as 'phoneID' or 'pdfID'.

    Return:
        an exkaldi alignment object. If <target> is a path whose files load into more than one kind of alignment object (for example .npy files together with kaldi alignment files), a list of those objects is returned.

    Raise:
        WrongOperation if <aliType> is not supported or no data was found.
        UnsupportedType if <target> or <hmm> has an unsupported type.
        WrongPath if <hmm> is a path that does not exist.
        ShellProcessError / KaldiProcessError if an external command fails.
    '''
    assert isinstance(
        name, str) and len(name) > 0, "Name shoud be a string avaliable."

    ExkaldiInfo.vertify_kaldi_existed()

    def transform(data, cmd):
        # Run <cmd> and parse its text output ("uttID id id ...") into {uttID: int32 array}.
        out, err, cod = run_shell_command(cmd,
                                          stdin=subprocess.PIPE,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE,
                                          inputs=data)
        # NOTE(review): failure is only reported when the exit code is non-zero AND the
        # output is empty; sibling loaders use "or" - confirm which one is intended.
        if (isinstance(cod, int) and cod != 0) and out == b'':
            print(err.decode())
            raise KaldiProcessError('Failed to transform alignment.')

        result = {}
        for line in BytesIO(out).readlines():
            fields = line.decode().strip().split()
            if not fields:
                # Skip blank lines instead of failing on fields[0].
                continue
            result[fields[0]] = np.array(fields[1:], dtype=np.int32)
        # Return the parsed dict (not the enclosing function's <results> table).
        return result

    if isinstance(target, dict):
        if aliType is None:
            result = NumpyAlignment(target, name)
        elif aliType == "transitionID":
            result = NumpyAlignmentTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAlignmentPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAlignmentPdf(target, name)
        else:
            raise WrongOperation(
                f"<aliType> should be None, 'transitionID', 'phoneID' or 'pdfID' but got {aliType}."
            )
        result.check_format()
        return result

    elif type_name(target) in [
            "NumpyAlignment", "NumpyAlignmentTrans", "NumpyAlignmentPhone",
            "NumpyAlignmentPdf", "BytesAlignmentTrans"
    ]:
        # Existing alignment objects are copied and renamed; <aliType> is not applied.
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, str):
        allFiles = list_files(target)

        # One accumulator per possible output class; empty ones are dropped at the end.
        results = {
            "NumpyAlignment": NumpyAlignment(),
            "NumpyAlignmentTrans": NumpyAlignmentTrans(),
            "NumpyAlignmentPhone": NumpyAlignmentPhone(),
            "NumpyAlignmentPdf": NumpyAlignmentPdf(),
            "BytesAlignmentTrans": BytesAlignmentTrans(),
        }

        for fileName in allFiles:
            fileName = os.path.abspath(fileName)

            if fileName.endswith(".npy"):
                loaded = __read_data_from_file(fileName, "npy")
                if aliType is None:
                    results["NumpyAlignment"] += NumpyAlignment(loaded.data)
                elif aliType == "transitionID":
                    results["NumpyAlignmentTrans"] += NumpyAlignmentTrans(loaded.data)
                elif aliType == "phoneID":
                    results["NumpyAlignmentPhone"] += NumpyAlignmentPhone(loaded.data)
                elif aliType == "pdfID":
                    results["NumpyAlignmentPdf"] += NumpyAlignmentPdf(loaded.data)
                else:
                    raise WrongOperation(
                        f"<aliType> should be None, 'transitionID','phoneID' or 'pdfID' but got {aliType}."
                    )
                continue

            if fileName.endswith('.gz'):
                cmd = f'gunzip -c {fileName}'
            else:
                cmd = f'cat {fileName}'

            if aliType is None or aliType == "transitionID":
                out, err, cod = run_shell_command(cmd,
                                                  stdout=subprocess.PIPE,
                                                  stderr=subprocess.PIPE)
                if (isinstance(cod, int) and cod != 0) or out == b'':
                    print(err.decode())
                    raise ShellProcessError(
                        "Failed to get the alignment data from file.")
                results["BytesAlignmentTrans"] += BytesAlignmentTrans(out)
                continue

            # phoneID / pdfID need the HMM on disk. The handle keeps its own name so that
            # it is the temp file (not the loaded data) that gets closed on exit.
            with tempfile.NamedTemporaryFile("wb+") as hmmTemp:
                if type_name(hmm) in ("HMM", "MonophoneHMM", "TriphoneHMM"):
                    hmm.save(hmmTemp)
                    hmmFileName = hmmTemp.name
                elif isinstance(hmm, str):
                    if not os.path.isfile(hmm):
                        raise WrongPath(f"No such file:{hmm}.")
                    hmmFileName = hmm
                else:
                    raise UnsupportedType(
                        f"<hmm> should be a filePath or exkaldi HMM and its sub-class object. but got {type_name(hmm)}."
                    )

                if aliType == "phoneID":
                    cmd += f" | ali-to-phones --per-frame=true {hmmFileName} ark:- ark,t:-"
                    results["NumpyAlignmentPhone"] += NumpyAlignmentPhone(transform(None, cmd))
                elif aliType == "pdfID":
                    # Append to the cat/gunzip stage; the pipeline needs its input.
                    cmd += f" | ali-to-pdf {hmmFileName} ark:- ark,t:-"
                    results["NumpyAlignmentPdf"] += NumpyAlignmentPdf(transform(None, cmd))
                else:
                    raise WrongOperation(
                        f"<aliType> should be None, 'transitionID', 'phoneID' or 'pdfID' but got {aliType}."
                    )

        finalResult = []
        for obj in results.values():
            if not obj.is_void:
                obj.rename(name)
                finalResult.append(obj)

        if len(finalResult) == 0:
            raise WrongOperation(
                "<target> dose not include any data avaliable.")
        elif len(finalResult) == 1:
            finalResult = finalResult[0]

        return finalResult

    else:
        raise UnsupportedType(
            f"<target> should be dict, file name or exkaldi alignment object but got: {type_name(target)}."
        )
def load_ali(target, aliType="transitionID", name="ali", hmm=None):
    '''
    Load alignment data.

    Args:
        <target>: Python dict object, exkaldi alignment object, IndexTable object, kaldi alignment file or .npy file.
        <aliType>: None, or one of 'transitionID', 'phoneID', 'pdfID'. It decides which alignment object is returned.
        <name>: a string.
        <hmm>: file path or exkaldi HMM object. It is used when alignment is converted to 'phoneID' or 'pdfID'.

    Return:
        an exkaldi alignment object.

    Raise:
        UnsupportedType if <target> has an unsupported type or an npy file cannot be loaded.
        ShellProcessError / KaldiProcessError if an external command fails.
    '''
    declare.is_valid_string("name", name)
    declare.is_instances("aliType", aliType,
                         [None, "transitionID", "phoneID", "pdfID"])
    declare.kaldi_existed()

    def transform(data, cmd):
        # Run <cmd> and parse its text output ("uttID id id ...") into {uttID: int32 array}.
        out, err, cod = run_shell_command(cmd,
                                          stdin="PIPE",
                                          stdout="PIPE",
                                          stderr="PIPE",
                                          inputs=data)
        # NOTE(review): failure is only reported when the exit code is non-zero AND the
        # output is empty; the plain-file branch below uses "or" - confirm which is intended.
        if (isinstance(cod, int) and cod != 0) and out == b'':
            raise KaldiProcessError('Failed to transform alignment.',
                                    err.decode())
        result = {}
        for line in BytesIO(out).readlines():
            fields = line.decode().strip().split()
            if not fields:
                # Skip blank lines instead of failing on fields[0].
                continue
            result[fields[0]] = np.array(fields[1:], dtype=np.int32)
        return result

    if isinstance(target, dict):
        if aliType is None:
            result = NumpyAli(target, name)
        elif aliType == "transitionID":
            result = NumpyAliTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAliPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAliPdf(target, name)
        else:
            raise WrongOperation(
                f"<aliType> should be None,'transitionID','phoneID' or 'pdfID' but got {aliType}."
            )
        result.check_format()
        return result

    elif isinstance(target, (NumpyAli, NumpyAliTrans, BytesAliTrans)):
        # Existing alignment objects are copied and renamed; <aliType> is not applied.
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, IndexTable):
        result = target.fetch(arkType="ali")
        if aliType in ["phoneID", "pdfID"]:
            result = result.to_numpy(aliType, hmm)
        result.rename(name)
        return result

    elif isinstance(target, str):
        allFiles = list_files(target)
        numpyAli = {}
        bytesAli = []

        for fileName in allFiles:
            fileName = fileName.strip()

            if fileName.endswith(".npy"):
                # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
                try:
                    temp = np.load(fileName, allow_pickle=True)
                    numpyAli.update(temp)
                except Exception as e:
                    # "except Exception" (not a bare except) so that KeyboardInterrupt and
                    # SystemExit are not reported as a file-format problem.
                    raise UnsupportedType(
                        f'This is not a valid Exkaldi npy file: {fileName}.') from e
                continue

            if fileName.endswith('.gz'):
                cmd = f'gunzip -c {fileName}'
            else:
                cmd = f'cat {fileName}'

            if aliType is None or aliType == "transitionID":
                out, err, cod = run_shell_command(cmd,
                                                  stdout="PIPE",
                                                  stderr="PIPE")
                if (isinstance(cod, int) and cod != 0) or out == b'':
                    raise ShellProcessError(
                        f"Failed to get the alignment data from file: {fileName}.",
                        err.decode())
                bytesAli.append(out)
            else:
                with FileHandleManager() as fhm:
                    declare.is_potential_hmm("hmm", hmm)
                    # Keep <hmm> untouched: rebinding it to the temp file name would make the
                    # next file in this loop reuse a path owned by an already-exited manager.
                    if isinstance(hmm, str):
                        hmmFile = hmm
                    else:
                        hmmTemp = fhm.create("wb+")
                        hmm.save(hmmTemp)
                        hmmFile = hmmTemp.name

                    if aliType == "phoneID":
                        cmd += f" | ali-to-phones --per-frame=true {hmmFile} ark:- ark,t:-"
                    else:
                        cmd += f" | ali-to-pdf {hmmFile} ark:- ark,t:-"
                    # Run inside the manager so the temp HMM file is still available.
                    temp = transform(None, cmd)

                numpyAli.update(temp)

        bytesAli = b"".join(bytesAli)

        if aliType is None or aliType == "transitionID":
            if len(numpyAli) == 0:
                return BytesAliTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                # With numpy data only, <aliType> None keeps the generic alignment class.
                if aliType is None:
                    return NumpyAli(numpyAli, name=name)
                return NumpyAliTrans(numpyAli, name=name)
            else:
                result = NumpyAliTrans(numpyAli) + BytesAliTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "phoneID":
            return NumpyAliPhone(numpyAli, name=name)
        else:
            return NumpyAliPdf(numpyAli, name=name)

    else:
        raise UnsupportedType(
            f"<target> should be dict,file name or exkaldi alignment or index table object but got: {type_name(target)}."
        )
def __read_data_from_file(fileName, useSuffix=None):
    '''
    Read data from file(s). If the file suffix is unknown, <useSuffix> is necessary.

    Args:
        <fileName>: file path. It is expanded with list_files().
        <useSuffix>: None, or a string ending with "ark", "scp" or "npy". It is applied to files whose own suffix is not one of these.

    Return:
        a tuple (bytes data, numpy data, data type):
        the concatenated copy-feats output of all ark/scp files,
        a dict of {utterance ID: array} collected from all npy files,
        and "numpy" or "bytes", decided by <useSuffix> or, if it is not given, by the suffix of the first file.

    Raise:
        UnsupportedType if a file suffix is unknown or an npy file is not valid.
        KaldiProcessError if copy-feats fails.
    '''
    declare.kaldi_existed()

    if useSuffix is not None:
        declare.is_valid_string("useSuffix", useSuffix)
        useSuffix = useSuffix.strip().lower()[-3:]
        declare.is_instances("useSuffix", useSuffix, ["ark", "scp", "npy"])
    else:
        useSuffix = ""

    allFiles = list_files(fileName)

    allData_bytes = []
    allData_numpy = {}

    def loadNpyFile(filePath):
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
        try:
            temp = np.load(filePath, allow_pickle=True)
            data = {}
            for utt_mat in temp:
                # Explicit check instead of assert, so it still runs under "python -O".
                if not (isinstance(utt_mat[0], str)
                        and isinstance(utt_mat[1], np.ndarray)):
                    raise TypeError("Expected (str, numpy.ndarray) records.")
                data[utt_mat[0]] = utt_mat[1]
        except Exception as e:
            # "except Exception" (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not reported as a file-format problem.
            raise UnsupportedType(
                f'This is not a valid Exkaldi npy file: {filePath}.') from e
        return data

    def loadArkScpFile(filePath, suffix):
        declare.kaldi_existed()
        # Anything other than "ark" is read with the "scp:" specifier.
        if suffix == "ark":
            cmd = 'copy-feats ark:'
        else:
            cmd = 'copy-feats scp:'
        cmd += '{} ark:-'.format(filePath)
        out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
        if (isinstance(cod, int) and cod != 0) or out == b'':
            raise KaldiProcessError('Failed to read archive table.',
                                    err.decode())
        return out

    for filePath in allFiles:
        # The file's own suffix wins; <useSuffix> is only the fallback.
        sfx = filePath.strip()[-3:].lower()
        if sfx == "npy":
            allData_numpy.update(loadNpyFile(filePath))
        elif sfx in ["ark", "scp"]:
            allData_bytes.append(loadArkScpFile(filePath, sfx))
        elif useSuffix == "npy":
            allData_numpy.update(loadNpyFile(filePath))
        elif useSuffix in ["ark", "scp"]:
            # Pass the declared suffix: <sfx> is unknown here, and handing it over
            # would make an "ark" file be read with the "scp:" specifier.
            allData_bytes.append(loadArkScpFile(filePath, useSuffix))
        else:
            raise UnsupportedType(
                'Unknown file suffix. You can appoint the <useSuffix> option with "scp","ark" or "npy".'
            )

    allData_bytes = b"".join(allData_bytes)

    # Without an explicit <useSuffix>, the first file decides the reported data type.
    if useSuffix == "":
        useSuffix = allFiles[0].strip()[-3:].lower()

    dataType = "numpy" if useSuffix == "npy" else "bytes"

    return allData_bytes, allData_numpy, dataType
def load_index_table(target, name="index", useSuffix=None):
    '''
    Load an index table from a dict, an archive object or archive table file(s).

    Args:
        <target>: dict object, .ark or .scp file, IndexTable object, bytes archive object.
        <name>: a string.
        <useSuffix>: "ark" or "scp". The file type is detected from its suffix first; <useSuffix> is only used for files ending with neither ".ark" nor ".scp".

    Return:
        an exkaldi IndexTable object.
    '''
    table = IndexTable(name=name)
    kind = type_name(target)

    # dict: each value is either an index record or the fields to build one.
    if kind == "dict":
        for uttID, info in target.items():
            if isinstance(info, (list, tuple)):
                assert len(info) in [
                    3, 4
                ], f"Expected (frames,start index,data size[,file path]) but {info} does not match."
                table[uttID] = table.spec(*info)
            elif type_name(info) == "Index":
                table[uttID] = info
            else:
                raise WrongDataFormat(
                    f"Expected list or tuple but got wrong index info format: {info}."
                )
        return table

    if kind == "IndexTable":
        table.update(target)
        return table

    if isinstance(target, BytesArchive):
        table.update(target.indexTable)
        return table

    # Anything else is treated as a file path.
    fileList = list_files(target)

    if useSuffix is None:
        useSuffix = ""
    else:
        declare.is_valid_string("useSuffix", useSuffix)
        useSuffix = useSuffix.strip()[-3:].lower()
        declare.is_instances("useSuffix", useSuffix, ["ark", "scp"])

    for fileName in fileList:
        trimmed = fileName.rstrip()
        # The file's own suffix takes priority over <useSuffix>.
        if trimmed.endswith(".ark"):
            fileKind = "ark"
        elif trimmed.endswith(".scp"):
            fileKind = "scp"
        else:
            fileKind = useSuffix

        if fileKind == "ark":
            part = __read_index_table_from_ark_file(fileName)
        elif fileKind == "scp":
            part = __read_index_table_from_scp_file(fileName)
        else:
            raise UnsupportedType(
                "Unknown file suffix. Specify <useSuffix> please.")

        table.update(part)

    return table
def __compute_feature(target,kaldiTool,useSuffix=None,name="feat",outFile=None):
    '''
    The base function to compute feature.

    Args:
        <target>: wave file path, scp file path, ListTable object or WavSegment object.
        <kaldiTool>: the kaldi command used to compute the feature.
        <useSuffix>: None, or a string ending with "scp" or "wav". It is used for files ending with neither ".wav" nor ".scp".
        <name>: a string.
        <outFile>: passed to check_multiple_resources() and used as the archive target of the command.

    Return:
        whatever run_kaldi_commands_parallel() returns for the prepared commands.
    '''
    declare.kaldi_existed()

    if useSuffix is None:
        useSuffix = ""
    else:
        declare.is_valid_string("useSuffix",useSuffix)
        useSuffix = useSuffix.strip().lower()[-3:]
        declare.is_instances("useSuffix",useSuffix,["scp","wav"])

    targets,kaldiTools,useSuffixs,names,outFiles = check_multiple_resources(target,kaldiTool,useSuffix,name,outFile=outFile)

    def collect_wav_table(path,fallbackSuffix):
        # Turn a path into a ListTable of {utterance ID: wave file path}.
        allFiles = list_files(path)
        table = ListTable()
        for filePath in allFiles:
            filePath = filePath.strip()
            ending = filePath[-4:].lower()
            if ending == ".wav":
                # The utterance ID is the file name without ".wav" and without dots.
                baseName = os.path.basename(filePath)
                table[baseName[0:-4].replace(".","")] = filePath
            elif ending == '.scp':
                table += load_list_table(filePath)
            elif "wav" == fallbackSuffix:
                # Unknown suffix declared as wave: the whole file name (dots removed) is the ID.
                baseName = os.path.basename(filePath)
                table[baseName.replace(".","")] = filePath
            elif "scp" == fallbackSuffix:
                table += load_list_table(filePath)
            else:
                raise UnsupportedType('Unknown file suffix. You can declare whether <useSuffix> is "wav" or "scp".')
        return table

    # pretreatment
    fromSegment = False
    with FileHandleManager() as fhm:

        segments = []
        for index,_tool,item,itemSuffix,itemName in zip(range(len(outFiles)),kaldiTools,targets,useSuffixs,names):

            declare.is_classes("target",item,["str","ListTable","WavSegment"])
            declare.is_valid_string("name",itemName)

            if isinstance(item,str):
                table = collect_wav_table(item,itemSuffix)
                if len(table) == 0:
                    raise WrongDataFormat("There did not include any data to compute data in target.")
                targets[index] = table

            elif type_name(item) == "WavSegment":
                # Segment info goes to a temp file; the wave table replaces the target.
                segTemp = fhm.create("w+",suffix=".seg",encode="utf-8")
                item.save(segTemp)
                segments.append(segTemp.name)
                targets[index] = item.detach_wav()
                fromSegment = True

        # define the command pattern and resources
        if fromSegment:
            cmdPattern = "extract-segments scp:{wavFile} {segment} ark:- | {kaldiTool} ark:- ark:{outFile}"
            resources = {"wavFile":targets,"segment":segments,"kaldiTool":kaldiTools,"outFile":outFiles}
        else:
            cmdPattern = "{kaldiTool} scp:{wavFile} ark:{outFile}"
            resources = {"wavFile":targets,"kaldiTool":kaldiTools,"outFile":outFiles}

        # Run (inside the manager, so the temp segment files still exist)
        return run_kaldi_commands_parallel(resources,cmdPattern,analyzeResult=True,generateArchive="feat",archiveNames=names)