Example #1
0
def compute_fbank(target,rate=16000,frameWidth=25,frameShift=10,
					melBins=23,windowType='povey',useSuffix=None,
					config=None,name="fbank",outFile=None):
	'''
	Compute fbank features.

	Share Args:
		Null

	Parallel Args:
		<target>: wave file,scp file,exkaldi ListTable object or WavSegment object. If it is a wave file,its file name is used as the utterance ID.
		<rate>: sample rate.
		<frameWidth>: window width (ms).
		<frameShift>: window shift (ms).
		<melBins>: the number of mel filter banks.
		<windowType>: window type.
		<useSuffix>: if the file suffix is not .scp or .wav,use this to specify it.
		<config>: extra optional configurations.
		<name>: the name of the output feature.
		<outFile>: output file name.

		Some usual options can be assigned directly. To use more,set <config> to your own configuration.
		You can use exkaldi.check_config('compute_fbank') to get the reference of extra configurations,
		or run the shell command "compute-fbank-feats" to look their usage.

	Return:
		exkaldi feature or index table object.
	'''
	# Broadcast and group the per-process parameters.
	stdParameters = check_multiple_resources(rate,frameWidth,frameShift,melBins,windowType,config)

	baseCmds = []
	for sr,fw,fs,mb,wt,cfg,_ in zip(*stdParameters):
		# sanity-check one group of parameters
		declare.is_positive_int("rate",sr)
		declare.is_positive_int("frameWidth",fw)
		declare.is_positive_int("frameShift",fs)
		declare.is_positive_int("melBins",mb)
		declare.greater_equal("frameWidth",fw,"frameShift",fs)
		declare.is_instances("windowType",wt,["hamming","hanning","povey","rectangular","blackmann"])

		# assemble the compute-fbank-feats command line
		parts = [
			'compute-fbank-feats --allow-downsample --allow-upsample',
			f'--sample-frequency={sr}',
			f'--frame-length={fw}',
			f'--frame-shift={fs}',
			f'--num-mel-bins={mb}',
			f'--window-type={wt}',
		]

		if cfg is not None:
			if check_config(name='compute_fbank',config=cfg):
				for key,val in cfg.items():
					if isinstance(val,bool):
						# boolean switches are emitted bare,and only when True
						if val is True:
							parts.append(key)
					else:
						parts.append(f"{key}={val}")

		baseCmds.append(" ".join(parts) + " ")

	# delegate the actual computation to the common routine
	return __compute_feature(target,baseCmds,useSuffix,name,outFile)
Example #2
0
def load_ali(target, aliType="transitionID", name="ali", hmm=None):
    '''
	Load alignment data.

	Args:
		<target>: Python dict object,bytes object,exkaldi alignment object,kaldi alignment file or .npy file.
		<aliType>: None,or one of 'transitionID','phoneID','pdfID'. It will return different alignment object.
		<name>: a string.
		<hmm>: file path or exkaldi HMM object. Only used when converting a
				transition-ID alignment file to phone or pdf IDs.

	Return:
		exkaldi alignment objects.
	'''
    declare.is_valid_string("name", name)
    declare.is_instances("aliType", aliType,
                         [None, "transitionID", "phoneID", "pdfID"])
    declare.kaldi_existed()

    # Helper: run <cmd> (optionally feeding <data> to stdin) and parse the
    # text-format alignment it prints ("utt id id id ...") into a dict that
    # maps utterance ID -> int32 NumPy vector.
    def transform(data, cmd):
        out, err, cod = run_shell_command(cmd,
                                          stdin="PIPE",
                                          stdout="PIPE",
                                          stderr="PIPE",
                                          inputs=data)
        # NOTE(review): this failure test uses "and" (fail only when the exit
        # code is non-zero AND stdout is empty) while similar checks in this
        # file use "or" — confirm which is intended.
        if (isinstance(cod, int) and cod != 0) and out == b'':
            raise KaldiProcessError('Failed to transform alignment.',
                                    err.decode())
        else:
            result = {}
            sp = BytesIO(out)
            for line in sp.readlines():
                line = line.decode()
                line = line.strip().split()
                utt = line[0]  # first token is the utterance ID
                matrix = np.array(line[1:], dtype=np.int32)
                result[utt] = matrix
            return result

    # Case 1: a Python dict -> wrap it with the NumPy alignment class matching <aliType>.
    if isinstance(target, dict):
        if aliType is None:
            result = NumpyAli(target, name)
        elif aliType == "transitionID":
            result = NumpyAliTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAliPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAliPdf(target, name)
        else:
            raise WrongOperation(
                f"<aliType> should be None,'transitionID','phoneID' or 'pdfID' but got {aliType}."
            )
        result.check_format()
        return result

    # Case 2: already an exkaldi alignment object -> deep-copy and rename.
    elif isinstance(target, (NumpyAli, NumpyAliTrans, BytesAliTrans)):
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    # Case 3: an index table -> fetch the alignment archive it points to.
    elif isinstance(target, IndexTable):
        result = target.fetch(arkType="ali")
        if aliType in ["phoneID", "pdfID"]:
            # converting transition IDs needs the HMM model
            result = result.to_numpy(aliType, hmm)
        result.rename(name)
        return result

    # Case 4: one or more file paths (.npy archives, .gz or plain alignment files).
    elif isinstance(target, str):
        allFiles = list_files(target)
        numpyAli = {}   # utterances loaded from .npy files
        bytesAli = []   # raw alignment byte chunks from kaldi files

        for fileName in allFiles:
            fileName = fileName.strip()
            if fileName.endswith(".npy"):
                try:
                    temp = np.load(fileName, allow_pickle=True)
                    numpyAli.update(temp)
                except:
                    raise UnsupportedType(
                        f'This is not a valid Exkaldi npy file: {fileName}.')
            else:
                # gzip-compressed kaldi alignments are decompressed on the fly
                if fileName.endswith('.gz'):
                    cmd = f'gunzip -c {fileName}'
                else:
                    cmd = f'cat {fileName}'

                if aliType is None or aliType == "transitionID":
                    # keep the raw transition-ID bytes untouched
                    out, err, cod = run_shell_command(cmd,
                                                      stdout="PIPE",
                                                      stderr="PIPE")
                    if (isinstance(cod, int) and cod != 0) or out == b'':
                        raise ShellProcessError(
                            f"Failed to get the alignment data from file: {fileName}.",
                            err.decode())
                    else:
                        bytesAli.append(out)

                else:
                    # convert transition IDs to phone or pdf IDs with kaldi tools
                    with FileHandleManager() as fhm:
                        declare.is_potential_hmm("hmm", hmm)
                        if not isinstance(hmm, str):
                            # dump the in-memory HMM object to a temp file
                            hmmTemp = fhm.create("wb+")
                            hmm.save(hmmTemp)
                            hmm = hmmTemp.name

                        if aliType == "phoneID":
                            cmd += f" | ali-to-phones --per-frame=true {hmm} ark:- ark,t:-"
                            temp = transform(None, cmd)

                        else:
                            cmd += f" | ali-to-pdf {hmm} ark:- ark,t:-"
                            temp = transform(None, cmd)

                    numpyAli.update(temp)

        bytesAli = b"".join(bytesAli)
        if aliType is None:
            if len(numpyAli) == 0:
                return BytesAliTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAli(numpyAli, name=name)
            else:
                # NOTE(review): numpyAli may have been loaded from .npy files
                # whose alignment type is unknown, yet it is wrapped as
                # NumpyAliTrans here — confirm this is intended.
                result = NumpyAliTrans(numpyAli) + BytesAliTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "transitionID":
            if len(numpyAli) == 0:
                return BytesAliTrans(bytesAli, name=name)
            elif len(bytesAli) == 0:
                return NumpyAliTrans(numpyAli, name=name)
            else:
                result = NumpyAliTrans(numpyAli) + BytesAliTrans(bytesAli)
                result.rename(name)
                return result
        elif aliType == "phoneID":
            return NumpyAliPhone(numpyAli, name=name)
        else:
            return NumpyAliPdf(numpyAli, name=name)

    else:
        raise UnsupportedType(
            f"<target> should be dict,file name or exkaldi alignment or index table object but got: {type_name(target)}."
        )
Example #3
0
def __read_data_from_file(fileName, useSuffix=None):
    '''
	Read feature data from file. If the file suffix is unknown,<useSuffix> is necessary.

	Args:
		<fileName>: one or more file paths (wildcards allowed).
		<useSuffix>: "ark","scp" or "npy". Used for files whose real suffix is none of these.

	Return:
		a triple: (bytes data,numpy data dict,"bytes" or "numpy").
	'''
    declare.kaldi_existed()

    if useSuffix is not None:
        declare.is_valid_string("useSuffix", useSuffix)
        useSuffix = useSuffix.strip().lower()[-3:]
        declare.is_instances("useSuffix", useSuffix, ["ark", "scp", "npy"])
    else:
        useSuffix = ""

    allFiles = list_files(fileName)

    allData_bytes = []
    allData_numpy = {}

    def loadNpyFile(fileName):
        # Load an exkaldi .npy archive: an array of (utterance ID, matrix) pairs.
        try:
            temp = np.load(fileName, allow_pickle=True)
            data = {}
            for utt_mat in temp:
                assert isinstance(utt_mat[0], str) and isinstance(
                    utt_mat[1], np.ndarray)
                data[utt_mat[0]] = utt_mat[1]
        except Exception:
            raise UnsupportedType(
                f'This is not a valid Exkaldi npy file: {fileName}.')
        else:
            return data

    def loadArkScpFile(fileName, suffix):
        # Pipe an ark/scp table through copy-feats and return the raw archive bytes.
        declare.kaldi_existed()

        if suffix == "ark":
            cmd = 'copy-feats ark:'
        else:
            cmd = 'copy-feats scp:'

        cmd += '{} ark:-'.format(fileName)
        out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
        if (isinstance(cod, int) and cod != 0) or out == b'':
            raise KaldiProcessError('Failed to read archive table.',
                                    err.decode())
        else:
            return out

    for fileName in allFiles:
        sfx = fileName.strip()[-3:].lower()
        if sfx == "npy":
            allData_numpy.update(loadNpyFile(fileName))
        elif sfx in ["ark", "scp"]:
            allData_bytes.append(loadArkScpFile(fileName, sfx))
        elif useSuffix == "npy":
            allData_numpy.update(loadNpyFile(fileName))
        elif useSuffix in ["ark", "scp"]:
            # BUG FIX: when the real suffix is unknown, the user-declared
            # <useSuffix> (not the unknown suffix <sfx>) must decide whether
            # the table is read as ark or scp. The original passed <sfx>,
            # which made every such file fall into the scp branch.
            allData_bytes.append(loadArkScpFile(fileName, useSuffix))
        else:
            raise UnsupportedType(
                'Unknown file suffix. You can appoint the <useSuffix> option with "scp","ark" or "npy".'
            )

    allData_bytes = b"".join(allData_bytes)

    if useSuffix == "":
        # infer the overall data type from the first file's suffix
        useSuffix = allFiles[0].strip()[-3:].lower()

    if useSuffix == "npy":
        dataType = "numpy"
    else:
        dataType = "bytes"

    return allData_bytes, allData_numpy, dataType
Example #4
0
def load_index_table(target, name="index", useSuffix=None):
    '''
	Load an index table from dict,or archive table file.

	Args:
		<target>: dict object,.ark or .scp file,IndexTable object,bytes archive object.
		<name>: a string.
		<useSuffix>: "ark" or "scp". We will check the file type by its suffix.
								But if <target> is file path and not default suffix (ark or scp),you have to declare which type it is.

	Return:
		an exkaldi IndexTable object.
	'''
    newTable = IndexTable(name=name)

    # Case 1: a plain dict of index records.
    if type_name(target) == "dict":
        for utt, info in target.items():
            if isinstance(info, (list, tuple)):
                assert len(info) in [
                    3, 4
                ], f"Expected (frames,start index,data size[,file path]) but {info} does not match."
                newTable[utt] = newTable.spec(*info)
            elif type_name(info) == "Index":
                newTable[utt] = info
            else:
                raise WrongDataFormat(
                    f"Expected list or tuple but got wrong index info format: {info}."
                )
        return newTable

    # Case 2: another index table.
    if type_name(target) == "IndexTable":
        newTable.update(target)
        return newTable

    # Case 3: a bytes archive that carries its own index table.
    if isinstance(target, BytesArchive):
        newTable.update(target.indexTable)
        return newTable

    # Case 4: one or more archive file paths.
    fileList = list_files(target)

    if useSuffix is not None:
        declare.is_valid_string("useSuffix", useSuffix)
        useSuffix = useSuffix.strip()[-3:].lower()
        declare.is_instances("useSuffix", useSuffix, ["ark", "scp"])
    else:
        useSuffix = ""

    for filePath in fileList:
        trimmed = filePath.rstrip()
        # prefer the real suffix; fall back to the declared one
        if trimmed.endswith(".ark"):
            part = __read_index_table_from_ark_file(filePath)
        elif trimmed.endswith(".scp"):
            part = __read_index_table_from_scp_file(filePath)
        elif useSuffix == "ark":
            part = __read_index_table_from_ark_file(filePath)
        elif useSuffix == "scp":
            part = __read_index_table_from_scp_file(filePath)
        else:
            raise UnsupportedType(
                "Unknown file suffix. Specify <useSuffix> please.")
        newTable.update(part)

    return newTable
Example #5
0
def __compute_feature(target,kaldiTool,useSuffix=None,name="feat",outFile=None):
	'''
	The base function used by the public compute_* functions to compute features.

	Args:
		<target>: wave file(s),scp file(s),ListTable or WavSegment object(s).
		<kaldiTool>: the prepared kaldi command head(s),e.g. "compute-fbank-feats ... ".
		<useSuffix>: "scp" or "wav",needed when a file suffix is neither .scp nor .wav.
		<name>: name(s) of the output feature.
		<outFile>: output file name(s).

	Return:
		exkaldi feature or index table object(s).
	'''
	declare.kaldi_existed()

	if useSuffix != None:
		declare.is_valid_string("useSuffix",useSuffix)
		useSuffix = useSuffix.strip().lower()[-3:]
		declare.is_instances("useSuffix",useSuffix,["scp","wav"])
	else:
		useSuffix = ""

	# Broadcast all parallel arguments to equally-sized groups.
	targets,kaldiTools,useSuffixs,names,outFiles = check_multiple_resources(target,kaldiTool,useSuffix,name,outFile=outFile)
	# pretreatment
	fromSegment = False
	with FileHandleManager() as fhm:

		segments = []
		for index,kaldiTool,target,useSuffix,name in zip(range(len(outFiles)),kaldiTools,targets,useSuffixs,names):
			
			declare.is_classes("target",target,["str","ListTable","WavSegment"])
			declare.is_valid_string("name",name)

			# A string target is expanded into a ListTable mapping utterance ID -> wav path.
			if isinstance(target,str):		
		
				allFiles = list_files(target)
				target = ListTable()

				for filePath in allFiles:
					filePath = filePath.strip()
					if filePath[-4:].lower() == ".wav":
						# use the file name (dots removed) as the utterance ID
						fileName = os.path.basename(filePath)
						uttID = fileName[0:-4].replace(".","")
						target[uttID] = filePath
					
					elif filePath[-4:].lower() == '.scp':
						target += load_list_table(filePath)
					
					elif "wav" == useSuffix:
						fileName = os.path.basename(filePath)
						uttID = fileName.replace(".","")
						target[uttID] = filePath

					elif "scp" == useSuffix:
						target += load_list_table(filePath)

					else:
						raise UnsupportedType('Unknown file suffix. You can declare whether <useSuffix> is "wav" or "scp".')
				
				if len(target) == 0:
					raise WrongDataFormat("There did not include any data to compute data in target.")

				targets[index] = target
			
			# A WavSegment is saved to a temp segment file and its wav table is detached.
			elif type_name(target) == "WavSegment":

				# NOTE(review): keyword is "encode" here but "encoding" in other
				# fhm.create calls in this file — confirm FileHandleManager.create's
				# real parameter name.
				segTemp = fhm.create("w+",suffix=".seg",encode="utf-8")
				target.save(segTemp)
				segments.append(segTemp.name)

				targets[index] = target.detach_wav()
				fromSegment = True

	# NOTE(review): <segments> holds temp-file paths created by <fhm>,but they are
	# consumed below after the "with" block has exited — confirm FileHandleManager
	# keeps its temp files alive until run_kaldi_commands_parallel has read them.
	if fromSegment:
		# define the command pattern
		cmdPattern = "extract-segments scp:{wavFile} {segment} ark:- | {kaldiTool} ark:- ark:{outFile}"
		# define resources
		resources = {"wavFile":targets,"segment":segments,"kaldiTool":kaldiTools,"outFile":outFiles}
	else:
		# define the command pattern
		cmdPattern = "{kaldiTool} scp:{wavFile} ark:{outFile}"
		# define resources
		resources = {"wavFile":targets,"kaldiTool":kaldiTools,"outFile":outFiles}

	# Run
	return run_kaldi_commands_parallel(resources,cmdPattern,analyzeResult=True,generateArchive="feat",archiveNames=names)
Example #6
0
def wer(ref, hyp, ignore=None, mode='all'):
    '''
	Compute WER (word error rate) between <ref> and <hyp>.

	Args:
		<ref>,<hyp>: exkaldi transcription object or file path.
		<ignore>: a symbol to strip from both texts before scoring.
		<mode>: "all" or "present".

	Return:
		a namedtuple of score information.
	'''
    declare.is_potential_transcription("ref", ref)
    declare.is_potential_transcription("hyp", hyp)
    declare.is_instances("mode", mode, ['all', 'present'])
    declare.kaldi_existed()

    if ignore is not None:
        declare.is_valid_string("ignore", ignore)

    with FileHandleManager() as fhm:

        if ignore is None:
            # dump in-memory transcriptions to temp files so compute-wer can read them
            if type_name(hyp) == "Transcription":
                hypTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
                hyp.save(hypTemp)
                hyp = hypTemp.name

            if type_name(ref) == "Transcription":
                refTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
                ref.save(refTemp)
                ref = refTemp.name

            cmd = f'compute-wer --text --mode={mode} ark:{ref} ark,p:{hyp}'
            scoreOut, scoreErr, _ = run_shell_command(cmd,
                                                      stdout="PIPE",
                                                      stderr="PIPE")

        else:
            # remove the ignored symbol in hyp
            if type_name(hyp) == "Transcription":
                hyp = hyp.save()
            else:
                with open(hyp, "r", encoding="utf-8") as fr:
                    hyp = fr.read()
            hypTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            cmd = f'sed "s/{ignore} //g" > {hypTemp.name}'
            # BUG FIX: stdout is redirected into the temp file by the shell,
            # so the captured output is always empty; the original
            # "len(output) == 0" test therefore always raised. Judge success
            # by the exit code instead.
            _, err, cod = run_shell_command(cmd,
                                            stdin="PIPE",
                                            stdout="PIPE",
                                            stderr="PIPE",
                                            inputs=hyp)
            if isinstance(cod, int) and cod != 0:
                raise WrongDataFormat("<hyp> has wrong data formation.",
                                      err.decode())
            # remove the ignored symbol in ref
            if type_name(ref) == "Transcription":
                ref = ref.save()
            else:
                with open(ref, "r", encoding="utf-8") as fr:
                    ref = fr.read()
            refTemp = fhm.create("w+", suffix=".txt", encoding="utf-8")
            cmd = f'sed "s/{ignore} //g" > {refTemp.name}'
            _, err, cod = run_shell_command(cmd,
                                            stdin="PIPE",
                                            stdout="PIPE",
                                            stderr="PIPE",
                                            inputs=ref)
            if isinstance(cod, int) and cod != 0:
                raise WrongDataFormat("<ref> has wrong data formation.",
                                      err.decode())
            # score
            cmd = f'compute-wer --text --mode={mode} ark:{refTemp.name} ark,p:{hypTemp.name}'
            scoreOut, scoreErr, _ = run_shell_command(cmd,
                                                      stdout="PIPE",
                                                      stderr="PIPE")

    if len(scoreOut) == 0:
        raise KaldiProcessError("Failed to compute WER.", scoreErr.decode())

    # Parse compute-wer's three-line text report, e.g.:
    #   %WER 10.00 [ 10 / 100, 1 ins, 2 del, 7 sub ]
    #   %SER 20.00 [ 2 / 10 ]
    #   Scored 10 sentences, 0 not present in hyp.
    out = scoreOut.decode().split("\n")
    # raw strings: avoid invalid-escape warnings for \[ \] \/
    pattern1 = r'%WER (.*) \[ (.*) \/ (.*),(.*) ins,(.*) del,(.*) sub \]'
    pattern2 = r"%SER (.*) \[ (.*) \/ (.*) \]"
    pattern3 = r"Scored (.*) sentences,(.*) not present in hyp."
    s1 = re.findall(pattern1, out[0])[0]
    s2 = re.findall(pattern2, out[1])[0]
    s3 = re.findall(pattern3, out[2])[0]

    return namedtuple("Score", [
        "WER", "words", "insErr", "delErr", "subErr", "SER", "sentences",
        "wrongSentences", "missedSentences"
    ])(
        float(s1[0]),  # WER
        int(s1[2]),  # words
        int(s1[3]),  # insertions
        int(s1[4]),  # deletions
        int(s1[5]),  # substitutions
        float(s2[0]),  # SER
        int(s2[1]),  # sentences
        int(s2[2]),  # wrong sentences
        int(s3[1])  # missed sentences
    )
Example #7
0
def edit_distance(ref, hyp, ignore=None, mode='present'):
    '''
	Compute edit-distance score.

	Args:
		<ref>,<hyp>: exkaldi Transcription objects or file paths.
		<ignore>: Ignoring specific symbols.
		<mode>: "all" or "present". With "present",utterances missing from <ref> are skipped;
				with "all",a missing utterance raises an error.

	Return:
		a namedtuple object including score information.
	'''
    declare.is_potential_transcription("ref", ref)
    declare.is_potential_transcription("hyp", hyp)
    declare.is_instances("mode", mode, ['all', 'present'])

    if ignore is not None:
        declare.is_valid_string("ignore", ignore)

    # load transcriptions from file when a path was given
    if isinstance(ref, str):
        ref = load_transcription(ref)

    if isinstance(hyp, str):
        hyp = load_transcription(hyp)

    allED = 0  # summed edit distance over all scored utterances
    words = 0  # summed reference word count
    sentences = 0
    wrongSentences = 0
    missedSentences = 0

    ref = ref.sort()
    hyp = hyp.sort()

    for utt, hypTrans in hyp.items():
        try:
            refTrans = ref[utt]
        except KeyError:
            if mode == "all":
                # BUG FIX: the original message told users to set <mode> as
                # 'all' to skip — but 'all' is the mode that raises; 'present'
                # is the one that skips missing utterances.
                raise Exception(
                    "Missing transcription in reference,set <mode> as 'present' to skip it."
                )
            else:
                missedSentences += 1
        else:
            sentences += 1
            refTrans = refTrans.split()
            hypTrans = hypTrans.split()
            ed, wds = pure_edit_distance(refTrans, hypTrans, ignore=ignore)
            allED += ed
            words += wds
            if ed > 0:
                # any non-zero distance marks the sentence as wrong
                wrongSentences += 1

    if sentences == 0:
        raise Exception(
            "Missing all transcription in reference. We don't think it's a reasonable result. Check the file please."
        )

    # SER = wrong sentences / scored sentences
    return namedtuple("Score", [
        "editDistance", "words", "SER", "sentences", "wrongSentences",
        "missedSentences"
    ])(allED, words, wrongSentences / sentences, sentences, wrongSentences,
       missedSentences)
Example #8
0
def pad_sequence(data,
                 dim=0,
                 maxLength=None,
                 dtype='float32',
                 padding='tail',
                 truncating='tail',
                 value=0.0):
    '''
	Pad sequence.

	Args:
		<data>: a list of NumPy arrays.
		<dim>: which dimension to pad. All other dimensions should be the same size.
		<maxLength>: If longer than this threshold,truncate it.
		<dtype>: target dtype.
		<padding>: padding position,"head","tail" or "random".
		<truncating>: truncating position,"head","tail".
		<value>: padding value.

	Return:
		a two-tuple: (a NumPy array,a list of (start,end) padding positions).
	'''
    declare.is_classes("data", data, (list, tuple))
    declare.is_non_negative_int("dim", dim)
    declare.not_void("data", data)
    declare.is_classes("value", value, (int, float))
    declare.is_instances("padding", padding, ["head", "tail", "random"])
    # BUG FIX: the original validated <padding> here a second time and never
    # validated <truncating>.
    declare.is_instances("truncating", truncating, ["head", "tail"])
    if maxLength is not None:
        declare.is_positive_int("maxLength", maxLength)

    lengths = []
    newData = []
    exRank = None       # rank expected of every array (taken from the first one)
    exOtherDims = None  # expected shape of the non-sequential dimensions
    for i in data:

        # verify each member is an ndarray with a consistent rank
        declare.is_classes("data", i, np.ndarray)
        shape = i.shape
        if exRank is None:
            exRank = len(shape)
            assert dim < exRank, f"<dim> is out of range: {dim}>{exRank-1}."
        else:
            assert len(
                shape
            ) == exRank, f"Arrays in <data> has different rank: {exRank}!={len(shape)}."

        if dim != 0:
            # move the sequential dimension to the front for uniform handling
            rank = [r for r in range(exRank)]
            rank[0] = dim
            rank[dim] = 0
            i = i.transpose(rank)

        if exOtherDims is None:
            exOtherDims = i.shape[1:]
        else:
            assert exOtherDims == i.shape[
                1:], f"Expect for sequential dimmension,All arrays in <data> has same shape but got: {exOtherDims}!={i.shape[1:]}."

        length = len(i)
        if maxLength is not None and length > maxLength:
            if truncating == "head":
                # BUG FIX: truncating from the head must keep the LAST
                # <maxLength> steps; the original "i[maxLength:]" kept the
                # remaining (length - maxLength) steps instead, which could
                # still exceed <maxLength>.
                i = i[length - maxLength:, ...]
            else:
                # keep the first <maxLength> steps
                i = i[0:maxLength, ...]

        lengths.append(len(i))
        newData.append(i)

    maxLength = max(lengths)
    batchSize = len(newData)

    # output buffer pre-filled with the padding value
    result = np.array(value, dtype=dtype) * np.ones(
        [batchSize, maxLength, *exOtherDims], dtype=dtype)

    pos = []
    for i in range(batchSize):
        length = lengths[i]
        if padding == "tail":
            result[i][0:length] = newData[i]
            pos.append((0, length))
        elif padding == "head":
            start = maxLength - length
            result[i][start:] = newData[i]
            pos.append((start, maxLength))
        else:
            # place the sequence at a random offset inside the buffer
            start = random.randint(0, maxLength - length)
            end = start + length
            result[i][start:end] = newData[i]
            pos.append((start, end))

    if dim != 0:
        # move the sequential dimension back to its original position
        exRank = len(result.shape)
        rank = [r for r in range(exRank)]
        rank[1] = dim + 1
        rank[dim + 1] = 1
        result = result.transpose(rank)

    return result, pos
Example #9
0
	def add(self,name,dtype,abbr=None,default=None,choices=None,minV=None,maxV=None,discription=None):
		'''
		Add a new command-line option.

		Args:
			_name_: a string which must have a format such as "--exkaldi" ("--help" is reserved and unavailable).
			_dtype_: float, int, str or bool.
			_abbr_: None or an abbreviation of name which must have a format such as "-e" ("-h" is reserved and unavailable).
			_default_: the default value or a list/tuple of values.
			_choices_: a list/tuple of allowed values.
			_minV_: set the minimum value if dtype is int or float. Enabled only when _choices_ is None.
			_maxV_: set the maximum value if dtype is int or float. Enabled only when _choices_ is None.
			_discription_: a string to describe this option.

		Raises:
			WrongOperation: if the option name or abbreviation already exists.
		'''
		self.__capture()

		# check option name: must look like "--xxx" and be unique
		declare.is_valid_string("name",name)
		name = name.strip()
		self.__detect_special_char(name)
		assert name[0:2] == "--" and name[2:3] != "-", f"Option name must start with '--' but got: {name}."
		assert name != "--help", "Option name is inavaliable: --help."
		if name in self.__arguments.keys():
			raise WrongOperation(f"Option name has existed: {name}.")
		
		# check dtype
		declare.is_instances("option dtype", dtype, (float,int,bool,str))

		# check abbreviation: must look like "-x" and be unique
		if abbr is not None:
			declare.is_valid_string("abbr",abbr)
			abbr = abbr.strip()
			self.__detect_special_char(abbr)
			assert abbr[0:1] == "-" and abbr[1:2] != "-", f"Abbreviation must start with '-' but got: {abbr}."
			assert abbr != "-h", "Abbreviation is inavaliable: -h."
			if abbr in self.__abb2Name.keys():
				raise WrongOperation(f"Abbreviation has existed: {abbr}.")

		# check default value: must match dtype (each member,if a list/tuple)
		if default is not None:
			if isinstance(default,(list,tuple)):
				declare.members_are_classes(f"Default value of {name}", default, dtype)
			else:
				declare.is_classes(f"Default value of {name}", default, dtype)
			if dtype == str:
				self.__detect_special_char(default)

		# check choices: every default value must be one of the choices
		if choices is not None:
			declare.is_classes(f"Choices of {name}", choices, (list,tuple))
			declare.members_are_classes(f"Choices of {name}", choices, dtype)
			if dtype == str:
				self.__detect_special_char(choices)
			if default is not None:
				if isinstance(default,(list,tuple)):
					declare.members_are_instances(f"Default value of {name}", default, choices)
				else:
					declare.is_instances(f"Default value of {name}", default, choices)
		
		# check boundary values: only numeric options,and mutually exclusive with choices
		if minV is not None or maxV is not None:
			assert dtype in [float,int], f"Only float and int option can set the boundary but {name} is {dtype.__name__}."
			assert choices is None, f"Cannot set choices and boundary concurrently: {name}."
			if minV is not None:
				declare.is_classes(f"Minimum value of {name}", minV, dtype)
				if default is not None:
					if isinstance(default, (list,tuple)):
						for v in default:
							declare.greater_equal(f"Default value of {name}", v, "minimum expected value", minV)
					else:
						declare.greater_equal(f"Default of {name}", default, "minimum expected value", minV)
			if maxV is not None:
				declare.is_classes(f"Maximum value of {name}", maxV, dtype)
				if default is not None:
					if isinstance(default,(list,tuple)):
						for v in default:					
							declare.less_equal(f"Default value of {name}", v, "maximum expected value", maxV)
					else:
						declare.less_equal(f"Default value of {name}", default, "maximum expected value", maxV)
			if minV is not None and maxV is not None:
				declare.less_equal(f"Minimum value of {name}", minV, f"maximum value", maxV)

		# check discription
		if discription is not None:
			declare.is_valid_string(f"Discription of {name}", discription)
			self.__detect_special_char(discription)

		# register the option and its abbreviation mapping
		self.__arguments[name] = self.spec(dtype,default,choices,minV,maxV,discription)
		self.__name2Abb[name] = abbr
		if abbr is not None:
			self.__abb2Name[abbr] = name
Example #10
0
	def load(self, filePath):
		'''
		Load arguments from an options file.

		The file is a discription block followed by blank-line-separated option
		blocks,each holding "key = value" lines for: name,abbr,dtype,default,
		choices,minV,maxV,discription and value.

		Args:
			_filePath_: args file path.
		'''
		declare.is_file("filePath", filePath)
		self.reset()

		with open(filePath, "r", encoding="utf-8") as fr:
			lines = fr.read()
		lines = lines.strip()
		if len(lines) == 0:
			raise WrongOperation(f"This is a void file: {filePath}.")
		
		blocks = lines.split("\n\n")
		
		def __parse(name, value, dtype):
			# Cast the raw string <value> to <dtype>,raising a friendly error on failure.
			if dtype in [float,int]:
				try:
					value = dtype(value)
				except ValueError:
					raise WrongOperation(f"Option <{name}> need a {dtype.__name__} value but choices got: {value}.")
			elif dtype == bool:
				if value.lower() == "true":
					value = True
				# BUG FIX: the original tested an undefined name "c" here,which
				# raised NameError for any non-"true" bool value.
				elif value.lower() == "false":
					value = False
				else:
					raise WrongOperation(f"Option <{name}> need a bool value but choices got: {value}.")

			return value  

		self.__discription = blocks[0].strip()
		for blockNo, block in enumerate(blocks[1:], start=1):
			block = block.strip()
			if len(block) == 0:
				continue
			block = block.split("\n")
			# 1. match options
			values = {"name":None,"abbr":None,"dtype":None,"default":None,"choices":None,"minV":None,"maxV":None,"discription":None,"value":None}
			for m in block:
				m = m.strip()
				assert "=" in m, f"Augument should has format: key = value, but got: {m}."
				assert len(m.split("=")) == 2, f"Augument should has format: key = value, but got: {m}."
				m = m.split("=")
				name = m[0].strip()
				value = m[1].strip()
				declare.is_instances("Option key", name, list(values.keys()))
				values[name] = value

			for key, value in values.items():
				# BUG FIX: the original referenced an undefined "lineNo" in this
				# message,so a missing key raised NameError instead of AssertionError.
				assert value is not None, f"Missed {key} information in block: {blockNo}."
			# 2. parse
			name = values["name"]
			# parse the dtype firstly (eval is guarded: only the four type names are accepted)
			declare.is_instances("dtype", values["dtype"], ["float","int","bool","str"])
			values["dtype"] = eval(values["dtype"])
			dtype = values["dtype"]	
			# then parse the choices
			choices = values["choices"]
			if choices in ["none", "None"]:
				choices = None
			else:
				choices = choices.split("|")
				for i, c in enumerate(choices):
					choices[i] = __parse(name, c, dtype)
			values["choices"] = choices
			# then parse the boundary value
			boundary = {"minV":None, "maxV":None}
			for i in boundary.keys():
				V = values[i]
				if V not in ["none", "None"]:
					assert dtype in [float,int], f"Only float and int option can set the boundary but {name} is {dtype.__name__}:"
					assert choices is None, f"{name} cannot set choices and boundary concurrently."
					
					# probe whether V parses as int and/or float
					toIntFlag = True
					toFloatFlag = True
					try:
						float(V)
					except ValueError:
						toFloatFlag= False
					try:
						int(V)
					except ValueError:
						toIntFlag= False
					
					if toIntFlag is False and toFloatFlag is False:
						raise WrongDataFormat(f"Boundary values of {name} should be a int or float value but got: {V}.")
					elif toIntFlag is False and toFloatFlag is True: # V is predicted to be a float value
						if dtype != float:
							raise WrongDataFormat(f"{name}'s dtype is int but try to set boundary value with a float value: {V}.")
						else:
							V = float(V)
					elif toIntFlag is True and toFloatFlag is True: # V parses as either,so honor the declared dtype
						V = dtype(V)
					else:
						raise WrongDataFormat(f"Failed to set {name}'s boundary value: {V}.")
				
					boundary[i] = V
			values["minV"] = boundary["minV"]
			values["maxV"] = boundary["maxV"]
			# then parse the default and value
			if values["default"].lower() == "none":
				values["default"] = None
			else:
				default = values["default"].split("|")
				for i, v in enumerate(default):
					default[i] = __parse(name, v, dtype)
				values["default"] = default if len(default) > 1 else default[0]
			
			# the judgement of "default" will be done by .parse() function, so here we only verify "value"
			if values["value"].lower() == "none":
				values["value"] = None
			else:
				value = values["value"].split("|")
				for i, v in enumerate(value):
					v = __parse(name, v, dtype)
					if values["choices"] is not None:
						declare.is_instances("Option value", v, values["choices"])
					else:
						if values["minV"] is not None:
							declare.greater_equal("Option value", v, "minimum expected value", values["minV"])
						if values["maxV"] is not None:
							declare.less_equal("Option value", v, "maximum expected value", values["maxV"])
					value[i] = v
				if len(value) == 1:
					value = value[0]
				values["value"] = value
			
			# check abbreviation
			if values["abbr"] in ["none", "None"]:
				values["abbr"] = None

			# add this option (validation of name/abbr/default is done by .add)
			self.add(name=values["name"],
					 dtype=values["dtype"],
					 abbr=values["abbr"],
					 default=values["default"],
					 choices=values["choices"],
					 minV=values["minV"],
					 maxV=values["maxV"],
					 discription=values["discription"]
					)
			
			# finally, modify the "value"
			self.__arguments[values["name"]] = self.__arguments[values["name"]]._replace(value=values["value"])
			if values["value"] is not None:
				self.__setattr__(values["name"], values["value"])
Example #11
0
def run_kaldi_commands_parallel(resources,cmdPattern,analyzeResult=True,timeout=ExKaldiInfo.timeout,generateArchive=None,archiveNames=None):
	'''
	Map resources to a command pattern and run the resulting Kaldi command(s) parallelly,
	one subprocess per output file.

	Args:
		<resources>: a dict whose keys are the name of resource and values are lists of resources objects.
					For example: {"feat": [BytesFeat01,BytesFeat02,... ],"outFile":{"newFeat01.ark","newFeat02.ark",...} }.
					The "outFile" resource is necessary.
					When there is only one process to run,"outFile" can be "-" which means the standard output stream.

		<cmdPattern>: a string needed to map the resources.
					For example: "copy-feat {feat} ark:{outFile}".

		<analyzeResult>: if True,raise KaldiProcessError when any subprocess exits with a non-zero code.
					It is forced to True whenever <generateArchive> is specified.

		<timeout>: seconds allowed for each subprocess (passed to run_shell_command_parallel;
					only used in the multi-process branch).

		<generateArchive>: None,or one of "feat","cmvn","ali","fmllr". If given,wrap the outputs into the
					corresponding exkaldi bytes-archive objects (stdout case) or index tables (file case)
					instead of returning raw result triples.

		<archiveNames>: name(s) for the generated archive objects: a single string (shared by all processes)
					or a list/tuple with exactly one name per process. Defaults to the value of <generateArchive>.

	Return:
		a list of triples: (return code,error info,output file or buffer),
		or exkaldi archive / index table object(s) when <generateArchive> is specified.

	Raises:
		WrongDataFormat,WrongOperation,UnsupportedType,KaldiProcessError.
	'''
	declare.kaldi_existed()
	declare.is_classes("resources",resources,dict)
	declare.is_classes("cmdPattern",cmdPattern,str)
	assert "outFile" in resources.keys(),"<outFile> key and value is necessary in recources."

	declare.members_are_classes("the values of resources",resources.values(),[list,tuple])
	if generateArchive is not None:
		analyzeResult = True #forcely analyze the result

	# Check the format of the command pattern:
	# for every "{name}" placeholder record how many times it occurs and which single
	# character immediately precedes it (the "prefix": used below to validate
	# "ark:" / "scp:" / "--option=" usage for each resource type).
	nameIndexs = [ i for i,c in enumerate(cmdPattern) if c == "{" or c == "}" ]
	assert len(nameIndexs)%2 == 0,f"The numbers of braces do not match in command pattern: '{cmdPattern}'. "
	auxiliaryInfo = {}
	for i in range(0,len(nameIndexs),2):
		name = cmdPattern[nameIndexs[i]+1:nameIndexs[i+1]]
		if name not in resources:
			raise WrongDataFormat(f"Resource is necessary but has not been provided: {name}.")
		prefix = "" if nameIndexs[i] == 0 else cmdPattern[nameIndexs[i]-1]
		if name in auxiliaryInfo.keys():
			auxiliaryInfo[name][0] += 1
			if not prefix in auxiliaryInfo[name][1]:
				auxiliaryInfo[name][1] += prefix
		else:
			auxiliaryInfo[name] = [1,prefix]

	assert "outFile" in auxiliaryInfo.keys(),"Key: <outFile> is necessary in command pattern."
	_outFileCountInfo = auxiliaryInfo.pop("outFile")
	assert _outFileCountInfo[0] == 1,f"Only allow <outFile> appear one time in command pattern but: {_outFileCountInfo[0]}."
	outFiles = resources.pop("outFile")

	for outFile in outFiles:
		if outFile != "-":
			make_dependent_dirs(outFile,pathIsFile=True)
	# the number of output files decides the number of parallel processes
	parallel = len(outFiles)

	if generateArchive is not None:
		declare.is_instances("generateArchive",generateArchive,["feat","cmvn","ali","fmllr"])
		# normalize <archiveNames> to one name per process
		if archiveNames is None:
			archiveNames = [ generateArchive for i in range(parallel)]
		elif isinstance(archiveNames,str):
			archiveNames = [ archiveNames for i in range(parallel)]
		elif isinstance(archiveNames,(list,tuple)):
			declare.equal("the number of achieve names",len(archiveNames),"parallel",parallel)
		else:
			raise UnsupportedType(f"<archiveNames> should be string or list or tuple but got: {type_name(archiveNames)}.")

	# regulate resources and run
	# NOTE(review): FileHandleManager presumably creates named temp files and removes them
	# when the "with" block exits — confirm against its definition elsewhere in the project.
	with FileHandleManager() as fhm:

		newResources = {}
		if parallel == 1:
			# Single-process branch: at most one in-memory resource may be streamed to the
			# subprocess's stdin instead of being written to a temp file.
			# Detect whether there is PIPE in command pattern.
			testPlaceholder = dict( (key,value[0]) if isinstance(value[0],str) else (key,"placeholder") for key,value in resources.items() )
			testPlaceholder["outFile"] = "placeholder"
			testCmd = cmdPattern.format(**testPlaceholder)
			if "|" in testCmd:
				# the pattern already pipes between programs, so stdin cannot be used for data
				inputsBuffer = False
			else:
				inputsBuffer = True
			del testPlaceholder
			# regularate resources
			# <inputsBuffer> acts as a tiny state machine: True = stdin still free,
			# False = stdin unavailable, otherwise it holds the data to feed to stdin.
			for key,countPrefix in auxiliaryInfo.items():
				count,prefix = countPrefix
				target = resources[key][0]

				# If target is a list-table,we can not automatically decide whether it is scp-format or ark-format.
				# So you should appoint it in the command parttern.
				if type_name(target) in ["ListTable","Transcription"]:
					if prefix not in [":","="]:
						errMes = f"There might miss prefix such as 'ark:' or 'scp:' or '--option=' in command pattern before resource: {key}."
						errMes += "Check the command line please. If you still think there dose not need the prefix,"
						errMes += "save this ListTable or Transcription into file and instead it will this file name."
						errMes += "In that case,we will skip checking the prefix."
						raise WrongOperation(errMes)

					target = target.sort()
					if (inputsBuffer is True) and count == 1:
						# stdin is still free and this resource appears once: stream it
						inputsBuffer = target.save()
						newResources[key] = "-"
					else:
						targetTemp = fhm.create("w+",encoding="utf-8")
						target.save(targetTemp)
						newResources[key] = f"{targetTemp.name}"

				# If target is an index-table,we automatically recognize it as scp-file,so you do not need appoint it.
				elif type_name(target) == "IndexTable":
					if prefix != " ":
						errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
						errMes += f"Because we will decide the prefix depending on its data type."
						raise WrongOperation(errMes)
						
					target = target.sort()
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.save()
						newResources[key] = "scp:-"
					else:
						targetTemp = fhm.create("w+",suffix=".scp",encoding="utf-8")
						target.save(targetTemp)
						newResources[key] = f"scp:{targetTemp.name}"
				
				elif isinstance(target,(str,int,float)):
					# file or other value parameter
					newResources[key] = f"{target}"
			
				elif isinstance(target,(BytesMatrix,BytesVector)):
					if prefix != " ":
						errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
						errMes += f"Because we will decide the prefix depending on its data type."						
						raise WrongOperation(errMes)

					target = target.sort()
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.data
						newResources[key] = "ark:-"
					else:					
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newResources[key] = f"ark:{targetTemp.name}"		

				elif isinstance(target,(NumpyMatrix,NumpyVector)):
					if prefix != " ":
						errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
						errMes += f"Because we will decide the prefix depending on its data type."		
						raise WrongOperation(errMes)

					# numpy archives are converted to kaldi binary (ark) form before use
					target = target.sort()
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.to_bytes().data
						newResources[key] = "ark:-"
					else:
						target = target.to_bytes()
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newResources[key] = f"ark:{targetTemp.name}"	

				elif isinstance(target,BytesArchive):
					# generic bytes archive: no prefix check, raw data or a bare temp-file name
					if (inputsBuffer is True) and count == 1:
						inputsBuffer = target.data
						newResources[key] = "-"
					else:
						targetTemp = fhm.create("wb+")
						target.save(targetTemp)
						newResources[key] = f"{targetTemp.name}"

				else:
					raise UnsupportedType(f"<target> should be IndexTable,ListTable,file name,int or float value,or exkaldi achieve object but got: {type_name(target)}.")
			
			# Then,process output stream
			outFile = outFiles[0]
			newResources["outFile"] = outFile
			# still a bool here means nothing was routed to stdin
			inputsBuffer = None if isinstance(inputsBuffer,bool) else inputsBuffer
			# Then rum command
			finalCmd = cmdPattern.format(**newResources)
			out,err,cod = run_shell_command(finalCmd,stdin="PIPE",stdout="PIPE",stderr="PIPE",inputs=inputsBuffer)
			
			if analyzeResult:
				if cod != 0:
					# shorten the failing command to just the program names of each piped stage
					finalCmd = ",".join([cmd.strip().split(maxsplit=1)[0] for cmd in finalCmd.split("|")])
					raise KaldiProcessError(f"Failed to run Kaldi command: {finalCmd}.",err.decode())
			
			if outFile == "-":
				# result stayed in the stdout buffer
				if generateArchive is not None:
					if generateArchive == "feat":
						out = BytesFeat(data=out,name=archiveNames[0])
					elif generateArchive == "ali":
						out = BytesAliTrans(data=out,name=archiveNames[0])
					elif generateArchive == "cmvn":
						out = BytesCMVN(data=out,name=archiveNames[0])
					else:
						out = BytesFmllr(data=out,name=archiveNames[0])
					return out
				else:
					return (cod,err,out)
			else:
				if generateArchive is not None:
					return load_index_table(outFile,name=archiveNames[0],useSuffix="ark")
				else:
					return (cod,err,outFile)

		else:
			# In this case,all input IO stream must be files.
			for key,countPrefix in auxiliaryInfo.items():
				count,prefix = countPrefix
				values = resources[key]
				newValues = []
				# same prefix validation and temp-file conversion as the single-process
				# branch above, but applied to every item of every resource list
				for target in values:

					# If target is scp resource
					if type_name(target) in ["ListTable","Transcription"]:
						if prefix not in [":","="]:
							errMes = f"There might miss prefix such as 'ark:' or 'scp:' or '--option=' in command pattern before resource: {key}."
							errMes += "Check the command line please. If you still think there dose not need the prefix,"
							errMes += "save this ListTable or Transcription into file and instead it will this file name."
							errMes += "In that case,we will skip checking the prefix."
							raise WrongOperation(errMes)		

						target = target.sort()
						targetTemp = fhm.create("w+",encoding="utf-8")
						target.save(targetTemp)
						newValues.append(f"{targetTemp.name}")						

					elif type_name(target) == "IndexTable":
						if prefix != " ":
							errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
							errMes += f"Because we will decide the prefix depending on its data type."
							raise WrongOperation(errMes)		

						target = target.sort()
						targetTemp = fhm.create("w+",suffix=".scp",encoding="utf-8")
						target.save(targetTemp)
						newValues.append(f"scp:{targetTemp.name}")
				
					elif isinstance(target,(str,float,int)):
						# file name or other value parameters
						newValues.append(f"{target}")
				
					elif isinstance(target,(BytesMatrix,BytesVector)):
						if prefix != " ":
							errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
							errMes += f"Because we will decide the prefix depending on its data type."						
							raise WrongOperation(errMes)	

						target = target.sort()
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newValues.append(f"ark:{targetTemp.name}")			

					elif isinstance(target,(NumpyMatrix,NumpyVector)):
						if prefix != " ":
							errMes = f"Do not need prefix such as 'ark:' or 'scp:' in command pattern before: {key}."
							errMes += f"Because we will decide the prefix depending on its data type."						
							raise WrongOperation(errMes)

						target = target.sort().to_bytes()
						targetTemp = fhm.create("wb+",suffix=".ark")
						target.save(targetTemp)
						newValues.append(f"ark:{targetTemp.name}")

					elif isinstance(target,BytesArchive):
						targetTemp = fhm.create("wb+")
						target.save(targetTemp)	
						newValues.append(f"{targetTemp.name}")

					else:
						raise UnsupportedType(f"<target> should be IndexTable,ListTable,Transcription,file,int or float values or exkaldi achieve object but got: {type_name(target)}.")
				
				newResources[key] = newValues
			
			newResources["outFile"] = outFiles
			# assign these resources to each process and generate multiple commands
			parallelResources = []
			for i in range(parallel):
				parallelResources.append({})
				for key,items in newResources.items():
					parallelResources[-1][key] = items[i]
			cmds = [ cmdPattern.format(**re) for re in parallelResources ]
			# run
			flags = run_shell_command_parallel(cmds,timeout=timeout)

			finalResult = []
			done = True
			for index,info in enumerate(flags):
				cod,err = info
				if analyzeResult and cod != 0:
					# print every failing process's stderr before raising once below
					print(f"{index}/{len(flags)} error tracking")
					print(err.decode())
					done = False	
				finalResult.append( (cod,err,outFiles[index]) )

			if analyzeResult and (not done):
				finalCmd = ",".join([cmd.strip().split(maxsplit=1)[0] for cmd in cmds[0].split("|")])
				raise KaldiProcessError(f"Failed to run Kaldi command: {finalCmd}. Look the error messages above.")
			else:
				if generateArchive is not None:
					# replace raw triples with index tables pointing at the generated archive files
					for i,fileName in enumerate(outFiles):
						finalResult[i] = load_index_table(fileName,name=archiveNames[i],useSuffix="ark")

			return finalResult