Example #1
def use_cmvn_sliding(feat,windowSize=None,std=False):
	'''
	Apply sliding CMVN statistics.

	Args:
		<feat>: exkaldi feature object.
		<windowSize>: window size. If None, use a window size greater than or equal to the longest utterance (in frames).
		<std>: a bool value.
	
	Return:
		exkaldi feature object.
	'''
	declare.is_classes("feat",feat, ["BytesFeature","NumpyFeature"])
	declare.is_bool("std",std)

	if windowSize is None:
		featLen = feat.lens[1]
		maxLen = max([length for utt,length in featLen])
		windowSize = math.ceil(maxLen/100)*100
	else:
		declare.is_positive_int("windowSize",windowSize)

	if std:
		std='true'
	else:
		std='false'

	cmd = f'apply-cmvn-sliding --cmn-window={windowSize} --min-cmn-window=100 --norm-vars={std} ark:- ark:-'
	out,err,cod = run_shell_command(cmd,stdin="PIPE",stderr="PIPE",stdout="PIPE",inputs=feat.data)
	if cod != 0:
		print(err.decode())
		raise KaldiProcessError("Failed to compute sliding cmvn.")
	
	newName = f"cmvn({feat.name},{windowSize})"
	return BytesFeature(out,name=newName,indexTable=None)
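
A minimal usage sketch; the feature object below is an assumption, not produced by the example above:

# Hypothetical usage: `mfcc` stands for an exkaldi feature object obtained elsewhere
# (for example, from exkaldi's feature-extraction helpers).
slidingFeat = use_cmvn_sliding(mfcc, windowSize=300, std=False)
print(slidingFeat.name)  # e.g. "cmvn(mfcc,300)"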
Example #2
def make_dependent_dirs(path, pathIsFile=True):
    '''
	Make the dependent directories for a path if they do not exist.

	Args:
		<path>: a file path or folder path.
		<pathIsFile>: a bool value declaring whether <path> is a file path (True) or a folder path (False).
	'''
    declare.is_valid_string("path", path)
    declare.is_bool("pathIsFile", pathIsFile)

    path = os.path.abspath(path.strip())

    if pathIsFile:
        if os.path.isdir(path):
            raise WrongPath(
                f"<path> is specified as file but it has existed as directory: {path}. You can remove it then try again."
            )
        else:
            dirPath = os.path.dirname(path)
    else:
        if os.path.isfile(path):
            raise WrongPath(
                f"<path> is specified as directory but it has existed as file: {path}. You can remove it then try again."
            )
        else:
            dirPath = path

    if not os.path.isdir(dirPath):
        try:
            os.makedirs(dirPath)
        except Exception as e:
            print(f"Failed to make directory: {dirPath}.")
            raise e
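
A short usage sketch; the paths are hypothetical:

# Create the parent directory "exp/log/" for a file path (the file itself is not created).
make_dependent_dirs("exp/log/train.log", pathIsFile=True)
# Create the directory itself when the path points to a folder.
make_dependent_dirs("exp/decode", pathIsFile=False)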
Example #3
def load_transcription(target, name="transcription", checkSpace=True):
    '''
	Load transcription from file.

	Args:
		<target>: transcription file path.
		<name>: a string.
		<checkSpace>: a bool value. If True, check that the transcriptions contain a reasonable number of spaces.

	Return:
		An exkaldi Transcription object.
	'''
    declare.is_classes("target", target,
                       ["dict", "Transcription", "ListTable", "str"])
    declare.is_bool("checkSpace", checkSpace)

    if isinstance(target, str):
        declare.is_file("target", target)
        with open(target, "r", encoding="utf-8") as fr:
            lines = fr.readlines()
        result = Transcription(name=name)
        for index, line in enumerate(lines, start=1):
            t = line.strip().split(maxsplit=1)
            if len(t) < 2:
                print(f"Line Number: {index}")
                print(f"Line Content: {line}")
                raise WrongDataFormat(
                    "Missing entire key and value information.")
            else:
                result[t[0]] = t[1]
    else:
        for utt, utterance in target.items():
            declare.is_valid_string("utterance ID", utt)
            declare.is_valid_string("utterance", utterance)
        result = Transcription(target, name=name)

    if checkSpace:
        sampleText = result.subset(nRandom=100)
        spaceCount = 0
        for key, value in sampleText.items():
            spaceCount += value.count(" ")
        if spaceCount < len(sampleText) // 2:
            errMes = "The transcription doesn't seem to be separated by spaces or extremely short."
            errMes += "If it actually has right format, set the <checkSpace>=False and run this function again."
            raise WrongDataFormat(errMes)

    return result
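
A usage sketch; the file path is hypothetical and assumed to hold Kaldi-style lines of "utteranceID transcription":

trans = load_transcription("data/train/text", name="trainTrans")
# For unsegmented or single-word transcriptions, disable the space check:
trans = load_transcription("data/train/text", checkSpace=False)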
Example #4
    def full_scores_sentence(self, sentence, bos=True, eos=True):
        '''
		Generate full scores (prob, ngram length, oov).

		Args:
			<sentence>: a string without boundary symbols.
			<bos>: If True, add <s> to the head.
			<eos>: If True, add </s> to the tail.

		Return:
			an iterator of (prob, ngram length, oov) tuples.
		'''
        declare.is_valid_string("sentence", sentence)
        declare.is_bool("bos", bos)
        declare.is_bool("eos", eos)

        return self.__model.full_scores(sentence, bos, eos)
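
A usage sketch, assuming `lm` is an already-loaded language model object that exposes this method:

for prob, ngramLength, oov in lm.full_scores_sentence("HELLO WORLD", bos=True, eos=True):
    print(prob, ngramLength, oov)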
Example #5
    def score_sentence(self, sentence, bos=True, eos=True):
        '''
		Score a sentence.

		Args:
			<sentence>: a string without boundary symbols.
			<bos>: If True, add <s> to the head.
			<eos>: If True, add </s> to the tail.

		Return:
			a float value.
		'''
        declare.is_valid_string("sentence", sentence)
        declare.is_bool("bos", bos)
        declare.is_bool("eos", eos)

        return self.__model.score(sentence, bos, eos)
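
A usage sketch, again assuming `lm` is a loaded language model object:

logProb = lm.score_sentence("HELLO WORLD", bos=True, eos=True)
print(logProb)  # a single float score for the whole sentence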
Example #6
def compress_gz_file(filePath, overWrite=False, keepSource=False):
    '''
	Compress a file into a gz file.

	Args:
		<filePath>: file path.
		<overWrite>: If True, overwrite the gz file if it already exists.
		<keepSource>: If True, retain the source file.
	
	Return:
		the path of the compressed file.
	'''
    declare.is_file("filePath", filePath)
    declare.is_bool("overWrite", overWrite)
    declare.is_bool("keepSource", keepSource)

    filePath = os.path.abspath(filePath)
    if filePath.endswith(".gz"):
        raise WrongOperation(f"Cannot compress a .gz file:{filePath}.")
    else:
        outFile = filePath + ".gz"

    if os.path.isfile(outFile):
        if overWrite is True:
            os.remove(outFile)
        else:
            raise WrongOperation(
                f"File has existed:{outFile}. If overwrite it,set option <overWrite>=True."
            )

    if keepSource:
        cmd = f"gzip -k {filePath}"
    else:
        cmd = f"gzip {filePath}"

    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError("Failed to compress file.")
    else:
        return outFile
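
A usage sketch; the file path is hypothetical:

gzPath = compress_gz_file("exp/final.ali", overWrite=True, keepSource=True)
print(gzPath)  # "exp/final.ali.gz"; the source file is retained because keepSource=True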
Example #7
def use_cmvn(feat,cmvn,utt2spk=None,std=False,outFile=None):
	'''
	Apply CMVN statistics to feature.

	Share Args:
		Null

	Parallel Args:
		<feat>: exkaldi feature or index table object.
		<cmvn>: exkaldi CMVN statistics or index object.
		<utt2spk>: file path or ListTable object.
		<std>: If True, apply standard deviation normalization.
		<outFile>: output file name.

	Return:
		feature or index table object.
	'''
	feats,cmvns,utt2spks,stds,outFiles = check_multiple_resources(feat,cmvn,utt2spk,std,outFile=outFile)

	names = []
	for i,feat,cmvn,utt2spk,std in zip(range(len(outFiles)),feats,cmvns,utt2spks,stds):
		# verify feature and cmvn
		declare.is_feature("feat",feat)
		declare.is_cmvn("cmvn",cmvn)
		# verify utt2spk
		if utt2spk is not None:
			declare.is_potential_list_table("utt2spk",utt2spk)
		# std
		declare.is_bool("std",std)
		#stds[i] = "true" if std else "false"
		names.append( f"cmvn({feat.name},{cmvn.name})" ) 

	if utt2spks[0] is None:
		cmdPattern = 'apply-cmvn --norm-vars={std} {cmvn} {feat} ark:{outFile}'
		resources = {"feat":feats,"cmvn":cmvns,"std":stds,"outFile":outFiles}
	else:
		cmdPattern = 'apply-cmvn --norm-vars={std} --utt2spk=ark:{utt2spk} {cmvn} {feat} ark:{outFile}'
		resources = {"feat":feats,"cmvn":cmvns,"utt2spk":utt2spks,"std":stds,"outFile":outFiles}	
	
	return run_kaldi_commands_parallel(resources,cmdPattern,analyzeResult=True,generateArchive="feat",archiveNames=names)
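
A usage sketch; the feature, CMVN statistics, and utt2spk path are assumptions produced elsewhere:

# `mfcc` and `cmvnStats` stand for an exkaldi feature object and its CMVN statistics.
normFeat = use_cmvn(mfcc, cmvnStats, utt2spk="data/train/utt2spk", std=False)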
Example #8
    def __init__(self,
                 indexTable,
                 processFunc,
                 batchSize,
                 chunks='auto',
                 otherArgs=None,
                 shuffle=False,
                 retainData=0.0):

        declare.is_index_table("indexTable", indexTable)
        declare.is_callable("processFunc", processFunc)
        declare.is_positive_int("batchSize", batchSize)
        declare.is_bool("shuffle", shuffle)
        declare.in_boundary("retainData", retainData, minV=0.0, maxV=0.9)

        self.processFunc = processFunc
        self._batchSize = batchSize
        self.otherArgs = otherArgs
        self._shuffle = shuffle
        self._chunks = chunks

        if chunks != 'auto':
            declare.is_positive_int("chunks", chunks)

        totalDataNumber = len(indexTable)
        trainDataNumber = int(totalDataNumber * (1 - retainData))
        evalDataNumber = totalDataNumber - trainDataNumber
        scpTable = indexTable.shuffle()

        self.trainTable = scpTable.subset(nHead=trainDataNumber)
        self.evalTable = scpTable.subset(nTail=evalDataNumber)

        if chunks == 'auto':
            #Compute the chunks automatically
            sampleTable = self.trainTable.subset(nHead=10)
            meanSize = sum(
                [indexInfo.dataSize
                 for indexInfo in sampleTable.values()]) / 10
            autoChunkSize = math.ceil(
                104857600 / meanSize)  # 100MB = 102400KB = 104857600 B
            self._chunks = trainDataNumber // autoChunkSize
            if self._chunks == 0:
                self._chunks = 1

        self.make_dataset_bag(shuffle=False)
        self._epoch = 0

        self.load_dataset(0)
        self.currentDataset = self.nextDataset
        self.nextDataset = None

        self.epochSize = len(self.currentDataset)
        self.countEpochSizeFlag = True

        self.currentPosition = 0
        self.currentEpochPosition = 0
        self._isNewEpoch = False
        self._isNewChunk = False
        self.datasetIndex = 0

        if self._chunks > 1:
            self.datasetIndex = 1
            self.loadDatasetThread = threading.Thread(target=self.load_dataset,
                                                      args=(1, ))
            self.loadDatasetThread.start()
Example #9
    def __init__(self,
                 indexTable,
                 processFunc,
                 batchSize,
                 chunks='auto',
                 otherArgs=None,
                 shuffle=False,
                 retainData=0.0):
        '''
		Args:
			_indexTable_: an ExKaldi IndexTable object whose <filePath> info must be available.
			_processFunc_: a function that receives an IndexTable object and returns an iterable dataset object.
										It needs at least two arguments: the data iterator itself and an IndexTable object holding one chunk of data.
			_batchSize_: mini batch size.
			_chunks_: how many chunks to split the data into.
			_otherArgs_: other arguments to send to <processFunc>.
			_shuffle_: If True, shuffle the batch data.
			_retainData_: a proportion value specifying how much data to retain for evaluation.
		'''
        declare.is_index_table("indexTable", indexTable)
        declare.is_callable("processFunc", processFunc)
        declare.is_positive_int("batchSize", batchSize)
        declare.is_bool("shuffle", shuffle)
        declare.in_boundary("retainData", retainData, minV=0.0, maxV=0.9)

        self.__processFunc = processFunc
        self.__batchSize = batchSize
        self.__otherArgs = otherArgs
        self.__shuffle = shuffle
        self.__chunks = chunks

        if chunks != 'auto':
            declare.is_positive_int("chunks", chunks)

        totalDataNumber = len(indexTable)
        trainDataNumber = int(totalDataNumber * (1 - retainData))
        evalDataNumber = totalDataNumber - trainDataNumber
        scpTable = indexTable.shuffle()

        self.__trainTable = scpTable.subset(nHead=trainDataNumber)
        if evalDataNumber > 0:
            self.__evalTable = scpTable.subset(nTail=evalDataNumber)
        else:
            self.__evalTable = None

        if chunks == 'auto':
            #Compute the chunks automatically
            sampleTable = self.__trainTable.subset(nHead=10)
            meanSize = sum(
                [indexInfo.dataSize
                 for indexInfo in sampleTable.values()]) / 10
            autoChunkSize = math.ceil(
                104857600 / meanSize)  # 100MB = 102400KB = 104857600 B
            self.__chunks = trainDataNumber // autoChunkSize
            if self.__chunks == 0:
                self.__chunks = 1

        # split train dataset into N chunks
        self.__make_dataset_bag(shuffle=False)

        # initialize some parameters
        self.__epoch = 0
        self.__currentPosition = 0
        self.__currentEpochPosition = 0
        self.__isNewEpoch = False
        self.__isNewChunk = False
        self.__datasetIndex = 0

        # load the first chunk data
        self.__load_dataset(0)
        self.__currentDataset = self.__nextDataset
        self.__nextDataset = None

        # accumulate counts
        self.__epochSize = len(self.__currentDataset)
        self.__countEpochSizeFlag = True

        # try to load the next chunk
        if self.__chunks > 1:
            self.__datasetIndex = 1
            self.__loadDatasetThread = threading.Thread(
                target=self.__load_dataset, args=(1, ))
            self.__loadDatasetThread.start()
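
A sketch of a <processFunc> matching the contract described in the docstring; the body is a placeholder assumption:

def my_process_func(dataIter, chunkIndexTable):
    # dataIter: the data iterator instance itself (gives access to its other arguments).
    # chunkIndexTable: an IndexTable covering one chunk of the training data.
    # Return any iterable dataset built from this chunk; here, just the utterance IDs.
    return list(chunkIndexTable.keys())

# Hypothetical construction (the enclosing class name is not shown above):
# dataIter = DataIterator(indexTable, my_process_func, batchSize=64, chunks='auto',
#                         shuffle=True, retainData=0.1)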
Example #10
def tuple_dataset(archives,frameLevel=False):
	'''
	Tuple feature or alignment archives in "utterance" level or "frame" level.

	Args:
		<archives>: exkaldi feature or alignment objects.
		<frameLevel>: If True, tuple data at the frame level; otherwise at the utterance level.

	Return:
		List of tupled data.
	'''
	declare.is_classes("archives",archives,(tuple,list))
	assert len(archives) > 1,"<archives> should have multiple items."
	declare.is_bool("frameLevel",frameLevel)
	
	archives = match_utterances(archives)

	fields = {}
	for index,ark in enumerate(archives):
		if frameLevel is True:
			declare.belong_classes("achieves",ark,(BytesMatrix,BytesVector,NumpyMatrix,NumpyVector))
		else:
			declare.belong_classes("achieves",ark,(BytesMatrix,BytesVector,NumpyMatrix,NumpyVector,ListTable))
		
		if isinstance(ark,(BytesMatrix,BytesVector)):
			ark = ark.to_numpy()

		if ark.name not in fields.keys():
			fields[ark.name] = []
		fields[ark.name].append(ark)

	fieldNames = list(fields.keys())

	try:
		if frameLevel:
			templet = namedtuple(typename="TupledData",field_names=["key","frameID",]+fieldNames)
		else:
			templet = namedtuple(typename="TupledData",field_names=["key",]+fieldNames)
	except ValueError as e:
		e.args = ('When tupling data, the "name" of each archive is used as a field identifier, so it must be a valid Python identifier. '+
							'You can use the ".rename()" method to rename it and try this function again.'+"\n"+
							e.args[0],)
		raise e

	def align_tuple_data_to_frame(key,record,templet):

		if isinstance(record[0],list):
			frameSize = len(record[0][0])
		else:
			frameSize = len(record[0])

		for re in record[1:]:
			if isinstance(re,list):
				for sr in re:
					if len(sr) != frameSize:
						raise WrongOperation(f"Cannot tuple data with different frame length to frame level: {frameSize}!={len(sr)}.")
			else:
				if len(re) != frameSize:
					raise WrongOperation(f"Cannot tuple data with different frame length to frame level: {frameSize}!={len(re)}.")				
		
		result = []
		for frameIndex in range(frameSize):
			new = []
			for re in record:
				if isinstance(re,list):
					fieldR = []
					for sr in re:
						fieldR.append( sr[frameIndex] )
					new.append(fieldR)
				else:
					new.append( re[frameIndex:frameIndex+1] )
					
			result.append(templet( key,frameIndex,*new  ))

		return result

	result = []
	for key in archives[0].keys():
		oneRecord = []
		for field in fieldNames:
			fieldData = []
			for ark in fields[field]:
				fieldData.append( ark.data[key] )
			if len(fieldData) == 1:
				fieldData = fieldData[0]
			oneRecord.append( fieldData )

		if frameLevel:
			result.extend( align_tuple_data_to_frame(key,oneRecord,templet) )
		else:
			result.append( templet(key,*oneRecord))
	
	return result
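
A usage sketch; the feature and alignment archives are assumptions produced elsewhere, and their names become field names of the returned namedtuples:

# `mfcc` and `ali` stand for matched exkaldi feature and alignment archives.
records = tuple_dataset([mfcc, ali], frameLevel=True)
first = records[0]
print(first.key, first.frameID)  # utterance ID and frame index of the first tupled frame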