Exemple #1
0
def load_list_table(target, name="listTable"):
    '''
    Build a ListTable from a dict-like object or from text file(s).

    Args:
        <target>: a dict object, a ListTable object, or a file path (resolved with list_files).
                  Every line of a file must hold a key and a value separated by whitespace.
        <name>: the name given to the new table.

    Return:
        a ListTable object.

    Raises:
        WrongDataFormat: a line of a file does not contain both a key and a value.
    '''
    declare.is_classes("target", target, [dict, ListTable, str])

    table = ListTable(name=name)

    # In-memory sources are copied directly.
    if type_name(target) in ("dict", "ListTable"):
        table.update(target)
        return table

    # Otherwise <target> is a path; parse each matched file line by line.
    for path in list_files(target):
        with open(path, "r", encoding="utf-8") as reader:
            content = reader.readlines()

        for lineNo, rawLine in enumerate(content, start=1):
            # Split on the first whitespace run only, so values may contain spaces.
            fields = rawLine.strip().split(maxsplit=1)
            if len(fields) < 2:
                raise WrongDataFormat(
                    f"Line Number: {lineNo}\n" + f"Line Content: {rawLine}\n" +
                    f"Missing paired key and value information in file:{path}."
                )
            key, value = fields
            table[key] = value

    return table
Exemple #2
0
def load_lat(target, name="lat"):
    '''
    Load lattice data.

    Args:
        <target>: a bytes object holding lattice data, or a lattice file path
                  (resolved with list_files; ".gz" files are decompressed with gunzip).
        <name>: a string, the name of the returned lattice.

    Return:
        A exkaldi Lattice object.

    Raises:
        WrongDataFormat: gunzip produced no output for a ".gz" file.
        WrongOperation: the loaded pieces could not be joined as bytes.
        UnsupportedType: <target> is neither bytes nor str.
    '''
    if isinstance(target, bytes):
        return Lattice(target, name)

    if not isinstance(target, str):
        raise UnsupportedType(f"Expected bytes object or lattice file but got: {type_name(target)}.")

    chunks = []
    for latFile in list_files(target):
        if latFile.endswith('.gz'):
            # Compressed lattice: stream the decompressed bytes from gunzip.
            unzipCmd = 'gunzip -c {}'.format(latFile)
            out, err, _ = run_shell_command(unzipCmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if out == b'':
                print(err.decode())
                raise WrongDataFormat('Failed to load Lattice.')
        else:
            # Plain lattice file: read the raw bytes.
            try:
                with open(latFile, 'rb') as fr:
                    out = fr.read()
            except Exception as e:
                print("Load lattice file defeated. Please make sure it is a lattice file avaliable.")
                raise e
        chunks.append(out)

    try:
        merged = b"".join(chunks)
    except Exception:
        raise WrongOperation("Only support binary format lattice file.")

    return Lattice(data=merged, name=name)
Exemple #3
0
def load_args(target):
    '''
    Reset the module-level <args> object and reload it from a file.

    Args:
        <target>: file path. It must resolve (through list_files) to exactly one file.

    Return:
        the module-level args object after loading.
    '''
    global args

    candidates = list_files(target)
    assert len(candidates) == 1, "Cannot load auguments from multiple files."
    argsFile = candidates[0]

    # Discard whatever was registered before, then load the file content.
    args.reset()
    args.load(argsFile)

    return args
Exemple #4
0
def __read_data_from_file(fileName, useSuffix=None):
    '''
    Read data from file(s) and merge it into one matrix object.

    Files whose name ends with "npy" are loaded with numpy; files ending with
    "ark" or "scp" are read through the Kaldi "copy-feats" tool. For any other
    suffix, <useSuffix> decides which loader is used.

    Args:
        <fileName>: a file path string (resolved with list_files). A directory is rejected.
        <useSuffix>: None, or a string whose last three characters are "ark", "scp" or "npy".

    Return:
        The merged data. It is numpy-based when <useSuffix> is "npy" (or, if <useSuffix>
        is not given, when the first file ends with "npy"); otherwise bytes-based.

    Raises:
        AssertionError: <useSuffix> is not a string or not one of the supported suffixes.
        WrongOperation: <fileName> is a directory.
        UnsupportedType: <fileName> is not a string, a file has an unknown suffix and no
                         <useSuffix> was given, or an npy file is not in exkaldi format.
        KaldiProcessError: "copy-feats" failed or produced no output.
    '''
    if useSuffix is not None:
        assert isinstance(useSuffix, str), "Expected <useSuffix> is a string."
        useSuffix = useSuffix.strip().lower()[-3:]
    else:
        useSuffix = ""

    assert useSuffix in [
        "", "scp", "ark", "npy"
    ], f'Expected <useSuffix> is "ark", "scp" or "npy" but got "{useSuffix}".'

    if not isinstance(fileName, str):
        raise UnsupportedType(
            f'Expected <fileName> is file name-like string but got a {type_name(fileName)}.'
        )
    if os.path.isdir(fileName):
        raise WrongOperation(
            f"Expected file name but got a directory:{fileName}.")
    allFiles = list_files(fileName)

    allData_bytes = BytesMatrix()
    allData_numpy = NumpyMatrix()

    def loadNpyFile(npyFile):
        # NOTE(review): allow_pickle=True executes pickled payloads; only load trusted files.
        try:
            loaded = np.load(npyFile, allow_pickle=True)
            # Each record is expected to be an (utterance ID, matrix) pair.
            data = {}
            for utt_mat in loaded:
                data[utt_mat[0]] = utt_mat[1]
        except Exception as e:
            # Catch Exception only, so KeyboardInterrupt/SystemExit still propagate.
            raise UnsupportedType(
                f'Expected "npy" data with exkaldi format but got {npyFile}.') from e
        else:
            return NumpyMatrix(data)

    def loadArkScpFile(tableFile, suffix):
        ExkaldiInfo.vertify_kaldi_existed()

        if suffix == "ark":
            cmd = 'copy-feats ark:'
        else:
            cmd = 'copy-feats scp:'

        cmd += '{} ark:-'.format(tableFile)
        out, err, cod = run_shell_command(cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
        if (isinstance(cod, int) and cod != 0) or out == b'':
            print(err.decode())
            raise KaldiProcessError('Copy feat defeated.')
        return BytesMatrix(out)

    for oneFile in allFiles:
        # The file's own suffix wins; <useSuffix> is only a fallback.
        sfx = oneFile[-3:].lower()
        if sfx == "npy":
            allData_numpy += loadNpyFile(oneFile)
        elif sfx in ["ark", "scp"]:
            allData_bytes += loadArkScpFile(oneFile, sfx)
        elif useSuffix == "npy":
            allData_numpy += loadNpyFile(oneFile)
        elif useSuffix in ["ark", "scp"]:
            allData_bytes += loadArkScpFile(oneFile, useSuffix)
        else:
            raise UnsupportedType(
                'Unknown file suffix. You can assign the <useSuffix> with "scp", "ark" or "npy".'
            )

    # Decide the output format: explicit <useSuffix> first, otherwise the first file's suffix.
    if useSuffix == "":
        wantNumpy = allFiles[0][-3:].lower() == "npy"
    else:
        wantNumpy = useSuffix == "npy"

    if wantNumpy:
        result = allData_numpy + allData_bytes.to_numpy()
    else:
        result = allData_bytes + allData_numpy.to_bytes()

    result.check_format()
    return result
Exemple #5
0
def load_ali(target, aliType=None, name="ali", hmm=None):
    '''
    Load alignment data.

    Args:
        <target>: Python dict object, exkaldi alignment object, kaldi alignment file or .npy file.
        <aliType>: None, or one of 'transitionID', 'phoneID', 'pdfID'. It decides which alignment
                   object is returned.
        <name>: a non-empty string.
        <hmm>: file path or exkaldi HMM object. Only used when a kaldi alignment file has to be
               converted to 'phoneID' or 'pdfID'.

    Return:
        An exkaldi alignment object. When <target> is a file path and more than one kind of
        alignment was collected, a list of alignment objects is returned instead.

    Raises:
        AssertionError: <name> is not a non-empty string.
        WrongOperation: <aliType> is invalid, or no data was found in <target>.
        WrongPath: <hmm> is a string but not an existing file.
        UnsupportedType: <hmm> or <target> has an unsupported type.
        ShellProcessError / KaldiProcessError: reading or transforming the alignment failed.
    '''
    assert isinstance(
        name, str) and len(name) > 0, "Name shoud be a string avaliable."

    ExkaldiInfo.vertify_kaldi_existed()

    def transform(data, cmd):
        # Run <cmd> and parse its text output ("uttID id id ...") into {uttID: int32 array}.
        out, err, cod = run_shell_command(cmd,
                                          stdin=subprocess.PIPE,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE,
                                          inputs=data)
        if (isinstance(cod, int) and cod != 0) and out == b'':
            print(err.decode())
            raise KaldiProcessError('Failed to transform alignment.')

        result = {}
        for line in BytesIO(out).readlines():
            fields = line.decode().strip().split()
            result[fields[0]] = np.array(fields[1:], dtype=np.int32)
        # Fixed: the original returned the enclosing function's <results> dict by mistake.
        return result

    if isinstance(target, dict):
        if aliType is None:
            result = NumpyAlignment(target, name)
        elif aliType == "transitionID":
            result = NumpyAlignmentTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAlignmentPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAlignmentPdf(target, name)
        else:
            raise WrongOperation(
                f"<aliType> should be None, 'transitionID', 'phoneID' or 'pdfID' but got {aliType}."
            )
        result.check_format()
        return result

    elif type_name(target) in [
            "NumpyAlignment", "NumpyAlignmentTrans", "NumpyAlignmentPhone",
            "NumpyAlignmentPdf", "BytesAlignmentTrans"
    ]:
        # Already an alignment object: hand back a renamed copy.
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, str):

        allFiles = list_files(target)

        # One accumulator per alignment kind; empty ones are dropped at the end.
        results = {
            "NumpyAlignment": NumpyAlignment(),
            "NumpyAlignmentTrans": NumpyAlignmentTrans(),
            "NumpyAlignmentPhone": NumpyAlignmentPhone(),
            "NumpyAlignmentPdf": NumpyAlignmentPdf(),
            "BytesAlignmentTrans": BytesAlignmentTrans(),
        }

        for fileName in allFiles:
            fileName = os.path.abspath(fileName)

            if fileName.endswith(".npy"):
                loaded = __read_data_from_file(fileName, "npy")
                if aliType is None:
                    results["NumpyAlignment"] += NumpyAlignment(loaded.data)
                elif aliType == "transitionID":
                    results["NumpyAlignmentTrans"] += NumpyAlignmentTrans(loaded.data)
                elif aliType == "phoneID":
                    results["NumpyAlignmentPhone"] += NumpyAlignmentPhone(loaded.data)
                elif aliType == "pdfID":
                    results["NumpyAlignmentPdf"] += NumpyAlignmentPdf(loaded.data)
                else:
                    raise WrongOperation(
                        f"<aliType> should be None, 'transitionID','phoneID' or 'pdfID' but got {aliType}."
                    )

            else:
                if fileName.endswith('.gz'):
                    cmd = f'gunzip -c {fileName}'
                else:
                    cmd = f'cat {fileName}'

                if aliType is None or aliType == "transitionID":
                    out, err, cod = run_shell_command(cmd,
                                                      stdout=subprocess.PIPE,
                                                      stderr=subprocess.PIPE)
                    if (isinstance(cod, int) and cod != 0) or out == b'':
                        print(err.decode())
                        raise ShellProcessError(
                            "Failed to get the alignment data from file.")
                    results["BytesAlignmentTrans"] += BytesAlignmentTrans(out)

                else:
                    # The temp file keeps its own name for the whole block, so it is the
                    # file (not a later result object) that gets closed on exit.
                    with tempfile.NamedTemporaryFile("wb+") as hmmTemp:
                        if type_name(hmm) in ("HMM", "MonophoneHMM",
                                              "TriphoneHMM"):
                            hmm.save(hmmTemp)
                            hmmFileName = hmmTemp.name
                        elif isinstance(hmm, str):
                            if not os.path.isfile(hmm):
                                raise WrongPath(f"No such file:{hmm}.")
                            hmmFileName = hmm
                        else:
                            raise UnsupportedType(
                                f"<hmm> should be a filePath or exkaldi HMM and its sub-class object. but got {type_name(hmm)}."
                            )

                        if aliType == "phoneID":
                            pipeline = cmd + f" | ali-to-phones --per-frame=true {hmmFileName} ark:- ark,t:-"
                            results["NumpyAlignmentPhone"] += NumpyAlignmentPhone(
                                transform(None, pipeline))

                        # Fixed: the original compared <target> (a file path) with "pdfID".
                        elif aliType == "pdfID":
                            # Fixed: append to <cmd>; the original overwrote the reader stage.
                            pipeline = cmd + f" | ali-to-pdf {hmmFileName} ark:- ark,t:-"
                            results["NumpyAlignmentPdf"] += NumpyAlignmentPdf(
                                transform(None, pipeline))
                        else:
                            raise WrongOperation(
                                f"<aliType> should be None, 'transitionID','phoneID' or 'pdfID' but got {aliType}."
                            )

        finalResult = []
        for obj in results.values():
            if not obj.is_void:
                obj.rename(name)
                finalResult.append(obj)

        if len(finalResult) == 0:
            raise WrongOperation(
                "<target> dose not include any data avaliable.")
        elif len(finalResult) == 1:
            finalResult = finalResult[0]

        return finalResult

    else:
        # The original silently returned None here.
        raise UnsupportedType(
            f"<target> should be dict, file name or exkaldi alignment object but got: {type_name(target)}."
        )
Exemple #6
0
def load_ali(target, aliType="transitionID", name="ali", hmm=None):
    '''
    Load alignment data.

    Args:
        <target>: Python dict object, exkaldi alignment object, IndexTable object,
                  kaldi alignment file or .npy file.
        <aliType>: None, or one of 'transitionID', 'phoneID', 'pdfID'. It decides which
                   alignment object is returned.
        <name>: a string.
        <hmm>: file path or exkaldi HMM object. Needed when a kaldi alignment file is
               converted to 'phoneID' or 'pdfID'.

    Return:
        exkaldi alignment objects.

    Raises:
        WrongOperation: <aliType> is invalid (dict target).
        UnsupportedType: <target> has an unsupported type, or an npy file cannot be loaded.
        ShellProcessError / KaldiProcessError: reading or transforming the alignment failed.
    '''
    declare.is_valid_string("name", name)
    declare.is_instances("aliType", aliType,
                         [None, "transitionID", "phoneID", "pdfID"])
    declare.kaldi_existed()

    def transform(data, cmd):
        # Run <cmd> and parse its text output ("uttID id id ...") into {uttID: int32 array}.
        out, err, cod = run_shell_command(cmd,
                                          stdin="PIPE",
                                          stdout="PIPE",
                                          stderr="PIPE",
                                          inputs=data)
        if (isinstance(cod, int) and cod != 0) and out == b'':
            raise KaldiProcessError('Failed to transform alignment.',
                                    err.decode())
        else:
            result = {}
            sp = BytesIO(out)
            for line in sp.readlines():
                line = line.decode()
                line = line.strip().split()
                utt = line[0]
                matrix = np.array(line[1:], dtype=np.int32)
                result[utt] = matrix
            return result

    if isinstance(target, dict):
        if aliType is None:
            result = NumpyAli(target, name)
        elif aliType == "transitionID":
            result = NumpyAliTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAliPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAliPdf(target, name)
        else:
            raise WrongOperation(
                f"<aliType> should be None,'transitionID','phoneID' or 'pdfID' but got {aliType}."
            )
        result.check_format()
        return result

    elif isinstance(target, (NumpyAli, NumpyAliTrans, BytesAliTrans)):
        # Already an alignment object: hand back a renamed copy.
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, IndexTable):
        result = target.fetch(arkType="ali")
        if aliType in ["phoneID", "pdfID"]:
            result = result.to_numpy(aliType, hmm)
        result.rename(name)
        return result

    elif isinstance(target, str):
        allFiles = list_files(target)
        numpyAli = {}
        bytesAli = []

        for fileName in allFiles:
            fileName = fileName.strip()
            if fileName.endswith(".npy"):
                # NOTE(review): allow_pickle=True executes pickled payloads; only load trusted files.
                try:
                    temp = np.load(fileName, allow_pickle=True)
                    numpyAli.update(temp)
                except Exception as e:
                    # Catch Exception only, so KeyboardInterrupt/SystemExit still propagate.
                    raise UnsupportedType(
                        f'This is not a valid Exkaldi npy file: {fileName}.') from e
            else:
                if fileName.endswith('.gz'):
                    cmd = f'gunzip -c {fileName}'
                else:
                    cmd = f'cat {fileName}'

                if aliType is None or aliType == "transitionID":
                    out, err, cod = run_shell_command(cmd,
                                                      stdout="PIPE",
                                                      stderr="PIPE")
                    if (isinstance(cod, int) and cod != 0) or out == b'':
                        raise ShellProcessError(
                            f"Failed to get the alignment data from file: {fileName}.",
                            err.decode())
                    else:
                        bytesAli.append(out)

                else:
                    with FileHandleManager() as fhm:
                        declare.is_potential_hmm("hmm", hmm)
                        # Keep <hmm> itself untouched. The original rebound it to the temp
                        # file name, so the next file in the loop would reuse a path that
                        # belongs to a manager which has already exited.
                        # NOTE(review): assumes FileHandleManager releases its files on exit - confirm.
                        if isinstance(hmm, str):
                            hmmFile = hmm
                        else:
                            hmmTemp = fhm.create("wb+")
                            hmm.save(hmmTemp)
                            hmmFile = hmmTemp.name

                        if aliType == "phoneID":
                            cmd += f" | ali-to-phones --per-frame=true {hmmFile} ark:- ark,t:-"
                            temp = transform(None, cmd)

                        else:
                            cmd += f" | ali-to-pdf {hmmFile} ark:- ark,t:-"
                            temp = transform(None, cmd)

                    numpyAli.update(temp)

        bytesAli = b"".join(bytesAli)
        if aliType is None:
            if len(numpyAli) == 0:
                return BytesAliTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAli(numpyAli, name=name)
            else:
                result = NumpyAliTrans(numpyAli) + BytesAliTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "transitionID":
            if len(numpyAli) == 0:
                return BytesAliTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAliTrans(numpyAli, name=name)
            else:
                result = NumpyAliTrans(numpyAli) + BytesAliTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "phoneID":
            return NumpyAliPhone(numpyAli, name=name)
        else:
            return NumpyAliPdf(numpyAli, name=name)

    else:
        raise UnsupportedType(
            f"<target> should be dict,file name or exkaldi alignment or index table object but got: {type_name(target)}."
        )
Exemple #7
0
def __read_data_from_file(fileName, useSuffix=None):
    '''
    Read data from file(s). If a file suffix is unknown, <useSuffix> is necessary.

    Files ending with "npy" are loaded with numpy; files ending with "ark" or "scp"
    are read through the Kaldi "copy-feats" tool. For any other suffix, <useSuffix>
    decides which loader is used.

    Args:
        <fileName>: file path (resolved with list_files).
        <useSuffix>: None, or a string whose last three characters are "ark", "scp" or "npy".

    Return:
        A tuple (bytes data, numpy data dict, dataType). The first item is the joined output
        of all ark/scp files, the second maps utterance IDs to arrays from all npy files, and
        dataType is "numpy" if <useSuffix> (or, when not given, the first file's suffix)
        is "npy", otherwise "bytes".

    Raises:
        UnsupportedType: unknown suffix without <useSuffix>, or an invalid npy file.
        KaldiProcessError: "copy-feats" failed or produced no output.
    '''
    declare.kaldi_existed()

    if useSuffix is not None:
        declare.is_valid_string("useSuffix", useSuffix)
        useSuffix = useSuffix.strip().lower()[-3:]
        declare.is_instances("useSuffix", useSuffix, ["ark", "scp", "npy"])
    else:
        useSuffix = ""

    allFiles = list_files(fileName)

    allData_bytes = []
    allData_numpy = {}

    def loadNpyFile(npyFile):
        # NOTE(review): allow_pickle=True executes pickled payloads; only load trusted files.
        try:
            temp = np.load(npyFile, allow_pickle=True)
            data = {}
            for utt_mat in temp:
                # Each record must be an (utterance ID, matrix) pair.
                assert isinstance(utt_mat[0], str) and isinstance(
                    utt_mat[1], np.ndarray)
                data[utt_mat[0]] = utt_mat[1]
        except Exception as e:
            # Catch Exception only (this still covers the AssertionError above), so
            # KeyboardInterrupt/SystemExit are no longer converted.
            raise UnsupportedType(
                f'This is not a valid Exkaldi npy file: {npyFile}.') from e
        else:
            return data

    def loadArkScpFile(tableFile, suffix):
        declare.kaldi_existed()

        if suffix == "ark":
            cmd = 'copy-feats ark:'
        else:
            cmd = 'copy-feats scp:'

        cmd += '{} ark:-'.format(tableFile)
        out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
        if (isinstance(cod, int) and cod != 0) or out == b'':
            raise KaldiProcessError('Failed to read archive table.',
                                    err.decode())
        return out

    for oneFile in allFiles:
        # The file's own suffix wins; <useSuffix> is only a fallback.
        sfx = oneFile.strip()[-3:].lower()
        if sfx == "npy":
            allData_numpy.update(loadNpyFile(oneFile))
        elif sfx in ["ark", "scp"]:
            allData_bytes.append(loadArkScpFile(oneFile, sfx))
        elif useSuffix == "npy":
            allData_numpy.update(loadNpyFile(oneFile))
        elif useSuffix in ["ark", "scp"]:
            # Fixed: pass <useSuffix>. The original passed the unknown <sfx>, so an
            # "ark" file with a non-standard suffix was always read as "scp".
            allData_bytes.append(loadArkScpFile(oneFile, useSuffix))
        else:
            raise UnsupportedType(
                'Unknown file suffix. You can appoint the <useSuffix> option with "scp","ark" or "npy".'
            )

    allData_bytes = b"".join(allData_bytes)

    # Without an explicit <useSuffix>, the first file decides the output type.
    if useSuffix == "":
        useSuffix = allFiles[0].strip()[-3:].lower()

    if useSuffix == "npy":
        dataType = "numpy"
    else:
        dataType = "bytes"

    return allData_bytes, allData_numpy, dataType
Exemple #8
0
def load_index_table(target, name="index", useSuffix=None):
    '''
    Build an IndexTable from a dict, another table, a bytes archive, or archive file(s).

    Args:
        <target>: dict object, IndexTable object, BytesArchive object, or a path to
                  .ark / .scp file(s).
        <name>: a string, the name of the new table.
        <useSuffix>: "ark" or "scp". The file type is normally detected from the file suffix;
                     declare it here when a file does not end with ".ark" or ".scp".

    Return:
        an exkaldi IndexTable object.

    Raises:
        AssertionError: a list/tuple value of a dict target does not have 3 or 4 items.
        WrongDataFormat: a value of a dict target is neither list/tuple nor an Index.
        UnsupportedType: a file has an unknown suffix and <useSuffix> was not given.
    '''
    table = IndexTable(name=name)
    targetType = type_name(target)

    if targetType == "dict":
        for uttID, info in target.items():
            if isinstance(info, (list, tuple)):
                assert len(info) in [
                    3, 4
                ], f"Expected (frames,start index,data size[,file path]) but {info} does not match."
                table[uttID] = table.spec(*info)
            elif type_name(info) == "Index":
                table[uttID] = info
            else:
                raise WrongDataFormat(
                    f"Expected list or tuple but got wrong index info format: {info}."
                )
        return table

    if targetType == "IndexTable":
        table.update(target)
        return table

    if isinstance(target, BytesArchive):
        table.update(target.indexTable)
        return table

    # Anything else is treated as a file path.
    fileList = list_files(target)

    if useSuffix is None:
        useSuffix = ""
    else:
        declare.is_valid_string("useSuffix", useSuffix)
        useSuffix = useSuffix.strip()[-3:].lower()
        declare.is_instances("useSuffix", useSuffix, ["ark", "scp"])

    for path in fileList:
        trimmed = path.rstrip()
        # The real suffix takes priority over the declared one.
        if trimmed.endswith(".ark"):
            kind = "ark"
        elif trimmed.endswith(".scp"):
            kind = "scp"
        else:
            kind = useSuffix

        if kind == "ark":
            part = __read_index_table_from_ark_file(path)
        elif kind == "scp":
            part = __read_index_table_from_scp_file(path)
        else:
            raise UnsupportedType(
                "Unknown file suffix. Specify <useSuffix> please.")

        table.update(part)

    return table
Exemple #9
0
def __compute_feature(target,kaldiTool,useSuffix=None,name="feat",outFile=None):
    '''
    The base function to compute feature.

    Args:
        <target>: a wave file path or scp file path, a ListTable object, or a WavSegment object.
                  Multiple resources are expanded by check_multiple_resources.
        <kaldiTool>: the Kaldi feature command used in the command pattern.
        <useSuffix>: None, or a string ending with "scp" or "wav". Used for files whose
                     suffix is neither ".wav" nor ".scp".
        <name>: a string, passed on as the archive name.
        <outFile>: output file name(s), passed to check_multiple_resources.

    Return:
        The result of run_kaldi_commands_parallel.

    Raises:
        UnsupportedType: a file has an unknown suffix and <useSuffix> was not given.
        WrongDataFormat: a path target did not yield any wave entry.
    '''
    declare.kaldi_existed()

    if useSuffix is not None:
        declare.is_valid_string("useSuffix",useSuffix)
        useSuffix = useSuffix.strip().lower()[-3:]
        declare.is_instances("useSuffix",useSuffix,["scp","wav"])
    else:
        useSuffix = ""

    targets,kaldiTools,useSuffixs,names,outFiles = check_multiple_resources(target,kaldiTool,useSuffix,name,outFile=outFile)
    # pretreatment
    fromSegment = False
    with FileHandleManager() as fhm:

        segments = []
        # Loop variables use their own names so the function parameters are not shadowed.
        for index,_,oneTarget,oneSuffix,oneName in zip(range(len(outFiles)),kaldiTools,targets,useSuffixs,names):

            declare.is_classes("target",oneTarget,["str","ListTable","WavSegment"])
            declare.is_valid_string("name",oneName)

            if isinstance(oneTarget,str):

                # Collect every wave file reachable from the path into one utterance table.
                allFiles = list_files(oneTarget)
                wavTable = ListTable()

                for filePath in allFiles:
                    filePath = filePath.strip()
                    if filePath[-4:].lower() == ".wav":
                        fileName = os.path.basename(filePath)
                        # Utterance ID: base name without the suffix and without dots.
                        uttID = fileName[0:-4].replace(".","")
                        wavTable[uttID] = filePath

                    elif filePath[-4:].lower() == '.scp':
                        wavTable += load_list_table(filePath)

                    elif "wav" == oneSuffix:
                        fileName = os.path.basename(filePath)
                        uttID = fileName.replace(".","")
                        wavTable[uttID] = filePath

                    elif "scp" == oneSuffix:
                        wavTable += load_list_table(filePath)

                    else:
                        raise UnsupportedType('Unknown file suffix. You can declare whether <useSuffix> is "wav" or "scp".')

                if len(wavTable) == 0:
                    raise WrongDataFormat("There did not include any data to compute data in target.")

                targets[index] = wavTable

            elif type_name(oneTarget) == "WavSegment":

                # Dump the segment info to a managed temp file that the Kaldi command reads.
                segTemp = fhm.create("w+",suffix=".seg",encode="utf-8")
                oneTarget.save(segTemp)
                segments.append(segTemp.name)

                targets[index] = oneTarget.detach_wav()
                fromSegment = True

        if fromSegment:
            # define the command pattern
            cmdPattern = "extract-segments scp:{wavFile} {segment} ark:- | {kaldiTool} ark:- ark:{outFile}"
            # define resources
            resources = {"wavFile":targets,"segment":segments,"kaldiTool":kaldiTools,"outFile":outFiles}
        else:
            # define the command pattern
            cmdPattern = "{kaldiTool} scp:{wavFile} ark:{outFile}"
            # define resources
            resources = {"wavFile":targets,"kaldiTool":kaldiTools,"outFile":outFiles}

        # Run inside the manager: the commands read the segment files created through <fhm>,
        # so those files must still be alive here. The original ran after the block exited.
        # NOTE(review): assumes FileHandleManager releases its files on exit - confirm.
        return run_kaldi_commands_parallel(resources,cmdPattern,analyzeResult=True,generateArchive="feat",archiveNames=names)