Example #1
def make_dependent_dirs(path, pathIsFile=True):
    '''
	Make the dependent directories for a path if they do not exist yet.

	Args:
		<path>: a file path or a folder path.
		<pathIsFile>: a bool value declaring whether <path> is a file path or a folder path.
	'''
    declare.is_valid_string("path", path)
    declare.is_bool("pathIsFile", pathIsFile)

    path = os.path.abspath(path.strip())

    if pathIsFile:
        if os.path.isdir(path):
            raise WrongPath(
                f"<path> is specified as file but it has existed as directory: {path}. You can remove it then try again."
            )
        else:
            dirPath = os.path.dirname(path)
    else:
        if os.path.isfile(path):
            raise WrongPath(
                f"<path> is specified as directory but it has existed as file: {path}. You can remove it then try again."
            )
        else:
            dirPath = path

    if not os.path.isdir(dirPath):
        try:
            os.makedirs(dirPath)
        except Exception as e:
            print(f"Failed to make directory: {dirPath}.")
            raise e
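
# A hedged usage sketch (hypothetical paths, not part of the source):
# make_dependent_dirs("exp/decode/results.txt")                 # prepares exp/decode/
# make_dependent_dirs("exp/decode/lattices", pathIsFile=False)  # prepares exp/decode/lattices/ itself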
Example #2
def make_dependent_dirs(path, pathIsFile=True):
    '''
	Make the dependent directories for a path if they do not exist yet.

	Args:
		<path>: a file path or a folder path.
		<pathIsFile>: a bool value declaring whether <path> is a file path or a folder path.
	'''
    assert isinstance(path, str), "<path> should be a string."
    path = os.path.abspath(path.strip())

    if pathIsFile:
        if os.path.isdir(path):
            raise WrongPath(
                f"<path> is specified as file but it has existed as directory: {path}. You can remove it then try again."
            )
        else:
            dirPath = os.path.dirname(path)
    else:
        if os.path.isfile(path):
            raise WrongPath(
                f"<path> is specified as directory but it has existed as file: {path}. You can remove it then try again."
            )
        else:
            dirPath = path

    if not os.path.isdir(dirPath):
        try:
            os.makedirs(dirPath)
        except Exception as e:
            print(f"Failed to make directory: {dirPath}.")
            raise e
Example #3
def decompress_gz_file(filePath, overWrite=False):
    '''
	Decompress a gz file.

	Args:
		<filePath>: file path.
		<overWrite>: if True, remove the existing decompressed file before decompressing.
	Return:
		the absolute path of the decompressed file.
	'''
    assert isinstance(
        filePath,
        str), f"<filePath> must be a string but got {type_name(filePath)}."
    filePath = filePath.rstrip()
    if not os.path.isfile(filePath):
        raise WrongPath(f"No such file:{filePath}.")
    elif not filePath.endswith(".gz"):
        raise WrongOperation(f"{filePath}: Unknown suffix.")

    outFile = filePath[:-3]
    if overWrite is True and os.path.isfile(outFile):
        os.remove(outFile)

    cmd = f"gzip -d {filePath}"
    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError("Failed to decompress file.")
    else:
        return os.path.abspath(outFile)
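
# A hedged usage sketch (hypothetical file name):
# textPath = decompress_gz_file("corpus/train_text.gz", overWrite=True)
# print(textPath)  # e.g. /abs/path/corpus/train_text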
Example #4
def compress_gz_file(filePath, overWrite=False):
    '''
	Compress a file to gz file.

	Args:
		<filePath>: file path.
		<overWrite>: if True, overwrite the gz file when it already exists.
	Return:
		the absolute path of the compressed file.
	'''
    assert isinstance(
        filePath,
        str), f"<filePath> must be a string but got {type_name(filePath)}."
    filePath = filePath.strip()
    if not os.path.isfile(filePath):
        raise WrongPath(f"No such file:{filePath}.")

    outFile = filePath + ".gz"
    if overWrite is True and os.path.isfile(outFile):
        os.remove(outFile)

    cmd = f"gzip {filePath}"
    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if cod != 0:
        print(err.decode())
        raise ShellProcessError("Failed to compress file.")
    else:
        return os.path.abspath(outFile)
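
# A hedged usage sketch (hypothetical file name); note that gzip itself removes
# the source file after compressing:
# gzPath = compress_gz_file("corpus/train_text", overWrite=True)
# print(gzPath)  # e.g. /abs/path/corpus/train_text.gz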
Example #5
def arpa_to_binary(arpaFile, outFile):
    '''
	Transform ARPA language model file to KenLM binary format file.

	Args:
		<arpaFile>: ARPA file path.
		<outFile>: output binary file path.
	Return:
		The absolute path of the output file.
	'''
    assert isinstance(arpaFile, str), "<arpaFile> should be a string."
    if not os.path.isfile(arpaFile):
        raise WrongPath(f"No such file:{arpaFile}.")

    assert isinstance(outFile, str), "<outFile> should be a string."
    make_dependent_dirs(outFile)

    cmd = os.path.join(sys.prefix, "exkaldisrc", "tools", "build_binary")
    cmd += f" -s {arpaFile} {outFile}"
    out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

    if (cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile)
                                                       == 0):
        print(err.decode())
        raise KenlmProcessError("Failed to transform ARPA to binary format.")

    else:
        return os.path.abspath(outFile)
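
# A hedged usage sketch (hypothetical paths):
# binPath = arpa_to_binary("lm/3gram.arpa", "lm/3gram.bin")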
Example #6
def split_txt_file(filePath, chunks=2):
    '''
	Split a text file into N chunks with roughly equal numbers of lines.

	Args:
		<filePath>: text file path.
		<chunks>: the number of chunks.
	Return:
		a list of chunk file names.
	'''
    assert isinstance(
        chunks, int
    ) and chunks > 1, "Expected <chunks> to be an int value larger than 1."

    if not os.path.isfile(filePath):
        raise WrongPath(f"No such file:{filePath}.")

    with open(filePath, 'r', encoding='utf-8') as fr:
        data = fr.readlines()

    lines = len(data)
    chunkLines = lines // chunks

    if chunkLines == 0:
        chunkLines = 1
        chunks = lines
        t = 0
    else:
        t = lines - chunkLines * chunks

    a = len(str(chunks))
    files = []

    filePath = os.path.abspath(filePath)
    dirIndex = filePath.rfind('/')
    if dirIndex == -1:
        dirName = ""
        fileName = filePath
    else:
        dirName = filePath[:dirIndex + 1]
        fileName = filePath[dirIndex + 1:]

    suffixIndex = fileName.rfind('.')
    if suffixIndex != -1:
        newFile = dirName + fileName[0:suffixIndex] + f"_%0{a}d" + fileName[
            suffixIndex:]
    else:
        newFile = dirName + fileName + f"_%0{a}d"

    for i in range(chunks):
        if i < t:
            chunkData = data[i * (chunkLines + 1):(i + 1) * (chunkLines + 1)]
        else:
            # offset by t: the first t chunks hold one extra line each
            chunkData = data[i * chunkLines + t:(i + 1) * chunkLines + t]
        with open(newFile % (i), 'w', encoding='utf-8') as fw:
            fw.write(''.join(chunkData))
        files.append(newFile % (i))

    return files
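
# A hedged usage sketch (hypothetical file): splitting corpus/text.txt into 4
# chunks would produce text_0.txt ... text_3.txt next to the source file:
# chunkFiles = split_txt_file("corpus/text.txt", chunks=4)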
Example #7
def compute_cmvn_stats(feat, spk2utt=None, name="cmvn"):
    '''
	Compute CMVN statistics.

	Args:
		<feat>: exkaldi feature object.
		<spk2utt>: spk2utt file or exkaldi ScriptTable object.
		<name>: a string.

	Return:
		An exkaldi CMVN statistics object.
	'''
    ExkaldiInfo.vertify_kaldi_existed()

    if type_name(feat) == "BytesFeature":
        feat = feat.sort("utt")
    elif type_name(feat) == "NumpyFeature":
        feat = feat.sort("utt").to_bytes()
    else:
        raise UnsupportedType(
            f"Expected <feat> to be an exkaldi feature object but got {type_name(feat)}."
        )

    spk2uttTemp = tempfile.NamedTemporaryFile("w+", encoding="utf-8")
    try:
        if spk2utt is None:
            cmd = 'compute-cmvn-stats ark:- ark:-'
        else:
            if isinstance(spk2utt, str):
                if not os.path.isfile(spk2utt):
                    raise WrongPath(f"No such file:{spk2utt}.")
                spk2uttSorted = ScriptTable(
                    name="spk2utt").load(spk2utt).sort()
                spk2uttSorted.save(spk2uttTemp)
            elif isinstance(spk2utt, ScriptTable):
                spk2uttSorted = spk2utt.sort()
                spk2uttSorted.save(spk2uttTemp)
            else:
                raise UnsupportedType(
                    f"<spk2utt> should be a file path or ScriptTable object but got {type_name(spk2utt)}."
                )
            spk2uttTemp.seek(0)

            cmd = f'compute-cmvn-stats --spk2utt=ark:{spk2uttTemp.name} ark:- ark:-'

        out, err, cod = run_shell_command(cmd,
                                          stdin=subprocess.PIPE,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE,
                                          inputs=feat.data)

        if (isinstance(cod, int) and cod != 0) or out == b'':
            print(err.decode())
            raise KaldiProcessError('Failed to compute CMVN statistics.')
        else:
            return BytesCMVNStatistics(out, name, indexTable=None)
    finally:
        spk2uttTemp.close()
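
# A hedged usage sketch (assumes <feat> is an exkaldi feature object and the
# spk2utt path is hypothetical):
# cmvn = compute_cmvn_stats(feat, spk2utt="data/train/spk2utt", name="cmvn")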
Example #8
    def __init__(self, filePath, name="ngram"):
        assert isinstance(
            filePath,
            str), f"<filePath> should be string but got {type_name(filePath)}."
        if not os.path.isfile(filePath):
            raise WrongPath(f"No such file:{filePath}.")
        else:
            with open(filePath, "rb") as fr:
                t = fr.read(50).decode().strip()
            if t != "mmap lm http://kheafield.com/code format version 5":
                raise UnsupportedType(
                    "This is not a KenLM binary model file.")

        super(KenNGrams, self).__init__(data=b"kenlm", name=name)
        self.__model = kenlm.Model(filePath)
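
# A hedged usage sketch (hypothetical path to a KenLM binary model):
# model = KenNGrams("lm/3gram.bin", name="ngram")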
Example #9
def transform_feat(feat, matrixFile):
    '''
	Transform feature with a transformation matrix, typically an LDA or MLLT matrix.

	Args:
		<feat>: exkaldi feature object.
		<matrixFile>: file name.
	
	Return:
		a new exkaldi feature object.
	'''
    assert isinstance(
        matrixFile, str
    ), f"<matrixFile> should be a file path but got: {type_name(matrixFile)}."
    if not os.path.isfile(matrixFile):
        raise WrongPath(f"No such file: {matrixFile}.")

    if type_name(feat) == "BytesFeature":
        bytesFlag = True
    elif type_name(feat) == "NumpyFeature":
        bytesFlag = False
        feat = feat.to_bytes()
    else:
        raise UnsupportedType(
            f"<feat> should be an exkaldi feature object but got: {type_name(feat)}."
        )

    cmd = f'transform-feats {matrixFile} ark:- ark:-'

    out, err, cod = run_shell_command(cmd,
                                      stdin=subprocess.PIPE,
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.PIPE,
                                      inputs=feat.data)

    if cod != 0:
        print(err.decode())
        raise KaldiProcessError("Failed to transform feature.")
    else:
        newName = f"tansform({feat.name})"
        newFeat = BytesFeature(out, name=newName)
        if bytesFlag:
            return newFeat
        else:
            return newFeat.to_numpy()
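
# A hedged usage sketch (hypothetical matrix file, e.g. an LDA+MLLT transform):
# newFeat = transform_feat(feat, "exp/tri3/final.mat")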
Example #10
def spk2utt_to_utt2spk(spk2uttFile, outFile):
    '''
	Transform spk2utt file to utt2spk file.

	Args:
		<spk2uttFile>: file name.
		<outFile>: file name.
	Return:
		the absolute path of the output file.
	'''
    assert isinstance(
        spk2uttFile, str
    ), f"<spk2uttFile> should be a string but got: {type_name(spk2uttFile)}."
    assert isinstance(
        outFile,
        str), f"<outFile> should be a string but got: {type_name(outFile)}."

    if not os.path.isfile(spk2uttFile):
        raise WrongPath(f"No such file: {spk2uttFile}.")

    utt2spk = {}
    with open(spk2uttFile, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for index, line in enumerate(lines, start=1):
        line = line.strip().split()
        if len(line) == 0:
            continue
        else:
            if len(line) < 2:
                raise WrongDataFormat(
                    f"Mismatching between utt and spk: {line}.")
            spk = line[0]
            for utt in line[1:]:
                if utt in utt2spk.keys():
                    raise WrongDataFormat(
                        f"utt:{utt} is repeated in line {index}.")
                utt2spk[utt] = spk

    # join each (utt, spk) pair into one line; a two-argument lambda cannot
    # be mapped over the tuples yielded by items()
    with open(outFile, "w") as fw:
        fw.write("\n".join(f"{utt} {spk}" for utt, spk in utt2spk.items()))

    return os.path.abspath(outFile)
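
# A hedged usage sketch (hypothetical paths): a spk2utt line "spk1 utt1 utt2"
# becomes the utt2spk lines "utt1 spk1" and "utt2 spk1":
# spk2utt_to_utt2spk("data/train/spk2utt", "data/train/utt2spk")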
Example #11
def list_files(fileName):
    '''
	List file paths.

	Args:
		<fileName>: a string.
	
	Return:
		A list of file paths.
	'''
    assert isinstance(fileName,
                      str), f"<fileName> should be a string but got: {type_name(fileName)}."

    cmd = f"ls {fileName}"
    out, err, cod = run_shell_command(cmd, stdout=subprocess.PIPE)

    if len(out) == 0:
        raise WrongPath(f"No such file: {fileName}.")
    else:
        out = out.decode().strip()
        return out.split("\n")
Example #12
def utt2spk_to_spk2utt(utt2spkFile, outFile):
    '''
	Transform utt2spk file to spk2utt file.

	Args:
		<utt2spkFile>: file name.
		<outFile>: file name.
	Return:
		the absolute path of the output file.
	'''
    assert isinstance(
        utt2spkFile, str
    ), f"<utt2spkFile> should be a string but got: {type_name(utt2spkFile)}."
    assert isinstance(
        outFile,
        str), f"<outFile> should be a string but got: {type_name(outFile)}."

    if not os.path.isfile(utt2spkFile):
        raise WrongPath(f"No such file: {utt2spkFile}.")

    spk2utt = {}
    with open(utt2spkFile, "r", encoding="utf-8") as fr:
        lines = fr.readlines()
    for index, line in enumerate(lines, start=1):
        line = line.strip().split()
        if len(line) == 0:
            continue
        else:
            if len(line) != 2:
                raise WrongDataFormat(
                    f"Mismatching between utt and spk: {line}, line {index}.")
            utt, spk = line[0], line[1]
            if spk not in spk2utt.keys():
                spk2utt[spk] = f"{utt}"
            else:
                spk2utt[spk] += f" {utt}"

    # join each (spk, utts) pair into one line; a two-argument lambda cannot
    # be mapped over the tuples yielded by items()
    with open(outFile, "w") as fw:
        fw.write("\n".join(f"{spk} {utts}" for spk, utts in spk2utt.items()))

    return os.path.abspath(outFile)
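
# A hedged usage sketch (hypothetical paths): the inverse of the function above;
# utt2spk lines "utt1 spk1" and "utt2 spk1" merge into "spk1 utt1 utt2":
# utt2spk_to_spk2utt("data/train/utt2spk", "data/train/spk2utt")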
Example #13
def list_files(filePaths):
    '''
	List files matched by a glob-style pattern string.

	Args:
		<filePaths>: a string or list or tuple object.
	
	Return:
		A list of file paths.
	'''
    declare.is_classes("filePaths", filePaths, [str, list, tuple])

    def list_one_record(target):
        declare.is_valid_string("filePaths", target)
        cmd = f"ls {target}"
        out, err, cod = run_shell_command(cmd, stdout=subprocess.PIPE)
        if len(out) == 0:
            return []
        else:
            out = out.decode().strip().split("\n")
            newOut = [o for o in out if os.path.isfile(o)]
            return newOut

    if isinstance(filePaths, str):
        outFiles = list_one_record(filePaths)
    else:
        outFiles = []
        for m in filePaths:
            outFiles.extend(list_one_record(m))

    if len(outFiles) == 0:
        raise WrongPath(
            f"No files were found with the provided file paths: {filePaths}."
        )

    return outFiles
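
# A hedged usage sketch (hypothetical patterns; ls-style globs are expanded):
# aliFiles = list_files("exp/tri1/ali.*.gz")
# mixed = list_files(["data/train/wav.scp", "data/dev/*.scp"])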
Example #14
def use_cmvn(feat, cmvn, utt2spk=None, std=False):
    '''
	Apply CMVN statistics to feature.

	Args:
		<feat>: exkaldi feature object.
		<cmvn>: exkaldi CMVN statistics object.
		<utt2spk>: utt2spk file path or ScriptTable object.
		<std>: If true, apply std normalization.

	Return:
		A new feature object.
	'''
    ExkaldiInfo.vertify_kaldi_existed()

    if type_name(feat) == "BytesFeature":
        feat = feat.sort(by="utt")
        numpyFlag = False
    elif type_name(feat) == "NumpyFeature":
        # remember the input type; <feat> is a bytes object from here on
        feat = feat.sort(by="utt").to_bytes()
        numpyFlag = True
    else:
        raise UnsupportedType(
            f"Expected exkaldi feature but got {type_name(feat)}.")

    if type_name(cmvn) == "BytesCMVNStatistics":
        cmvn = cmvn.sort(by="utt")
    elif type_name(cmvn) == "NumpyCMVNStatistics":
        cmvn = cmvn.sort(by="utt").to_bytes()
    else:
        raise UnsupportedType(
            f"Expected exkaldi CMVN statistics but got {type_name(cmvn)}.")

    cmvnTemp = tempfile.NamedTemporaryFile('wb+', suffix='_cmvn.ark')
    utt2spkTemp = tempfile.NamedTemporaryFile('w+',
                                              suffix="_utt2spk",
                                              encoding="utf-8")
    try:
        cmvnTemp.write(cmvn.data)
        cmvnTemp.seek(0)

        if std is True:
            stdOption = " --norm-vars true"
        else:
            stdOption = ""

        if utt2spk is None:
            cmd = f'apply-cmvn{stdOption} ark:{cmvnTemp.name} ark:- ark:-'
        else:
            if isinstance(utt2spk, str):
                if not os.path.isfile(utt2spk):
                    raise WrongPath(f"No such file:{utt2spk}.")
                utt2spkSorted = ScriptTable(
                    name="utt2spk").load(utt2spk).sort()
                utt2spkSorted.save(utt2spkTemp)
            elif isinstance(utt2spk, ScriptTable):
                utt2spkSorted = utt2spk.sort()
                utt2spkSorted.save(utt2spkTemp)
            else:
                raise UnsupportedType(
                    f"<utt2spk> should be a file path or ScriptTable object but got {type_name(utt2spk)}."
                )
            utt2spkTemp.seek(0)

            cmd = f'apply-cmvn{stdOption} --utt2spk=ark:{utt2spkTemp.name} ark:{cmvnTemp.name} ark:- ark:-'

        out, err, cod = run_shell_command(cmd,
                                          stdin=subprocess.PIPE,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE,
                                          inputs=feat.data)

        if (isinstance(cod, int) and cod != 0) or out == b'':
            print(err.decode())
            raise KaldiProcessError('Failed to apply CMVN statistics.')
        else:
            newName = f"cmvn({feat.name},{cmvn.name})"
            if type_name(feat) == "NumpyFeature":
                return BytesFeature(out, newName, indexTable=None).to_numpy()
            else:
                return BytesFeature(out, newName, indexTable=None)
    finally:
        cmvnTemp.close()
        utt2spkTemp.close()
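
# A hedged usage sketch (hypothetical paths): per-speaker mean normalization,
# optionally with variance normalization via std=True:
# cmvn = compute_cmvn_stats(feat, spk2utt="data/train/spk2utt")
# normFeat = use_cmvn(feat, cmvn, utt2spk="data/train/utt2spk", std=True)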
Example #15
def train_ngrams_srilm(lexicons, order, textFile, outFile, config=None):
    '''
	Train an n-grams language model with the SRILM toolkit.

	Args:
		<lexicons>: an exkaldi LexiconBank object.
		<order>: the maximum order of n-grams.
		<textFile>: text corpus file.
		<outFile>: ARPA out file name.
		<config>: configures, a Python dict object.

	You can use the .check_config("train_ngrams_srilm") function to get the configure information that you can set.
	Also you can run the shell command "ngram-count" to look over their meaning.
	'''
    assert isinstance(
        order, int
    ) and 0 < order < 10, "Expected <order> to be a positive int value smaller than 10."
    assert isinstance(textFile,
                      str), "Expected <textFile> to be a name-like string."
    assert isinstance(outFile, str), "Expected <outFile> to be a name-like string."
    assert type_name(
        lexicons
    ) == "LexiconBank", f"Expected <lexicons> to be an exkaldi LexiconBank object but got {type_name(lexicons)}."

    ExkaldiInfo.prepare_srilm()

    if not os.path.isfile(textFile):
        raise WrongPath(f"No such file:{textFile}")
    else:
        ## Should check the numbers of lines
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
        if (isinstance(cod, int) and cod != 0):
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces, or the lines are extremely short."
                )

    wordlist = tempfile.NamedTemporaryFile("w+",
                                           encoding='utf-8',
                                           suffix=".txt")
    unkSymbol = lexicons("oov")
    try:
        lexiconp = lexicons("lexiconp")
        words = [x[0] for x in lexiconp.keys()]
        wordlist.write("\n".join(words))
        wordlist.seek(0)

        #cmd2 = f"ngram-count -text {textFile} -order {order}"
        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    # any user-specified discount option disables the default -kndiscount
                    if key.endswith("discount"):
                        specifyDiscount = True
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f"ngram-count -text {textFile} -order {order} -limit-vocab -vocab {wordlist.name} -unk -map-unk {unkSymbol} "
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError(
                'Failed to generate n-grams language model.')
        else:
            return os.path.abspath(outFile)

    finally:
        wordlist.close()
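
# A hedged usage sketch (assumes <lexicons> is an exkaldi LexiconBank object;
# paths and the config key are hypothetical):
# arpaPath = train_ngrams_srilm(lexicons, order=3, textFile="data/lm_train.txt",
#                               outFile="lm/3gram.arpa", config={"-wbdiscount": True})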
Example #16
def __compute_feature(wavFile, kaldiTool, useSuffix=None, name="feat"):

    if useSuffix is not None:
        assert isinstance(useSuffix, str), "Expected <useSuffix> to be a string."
        useSuffix = useSuffix.strip().lower()[-3:]
    else:
        useSuffix = ""
    assert useSuffix in ["", "scp",
                         "wav"], 'Expected <useSuffix> to be "scp" or "wav".'

    ExkaldiInfo.vertify_kaldi_existed()

    wavFileTemp = tempfile.NamedTemporaryFile("w+",
                                              suffix=".scp",
                                              encoding="utf-8")
    try:
        if isinstance(wavFile, str):
            if os.path.isdir(wavFile):
                raise WrongOperation(
                    f'Expected <wavFile> is file path but got a directory:{wavFile}.'
                )
            else:
                out, err, cod = run_shell_command(f'ls {wavFile}',
                                                  stdout=subprocess.PIPE,
                                                  stderr=subprocess.PIPE)
                if out == b'':
                    raise WrongPath(f"No such file:{wavFile}.")
                else:
                    allFiles = out.decode().strip().split('\n')
        elif isinstance(wavFile, ScriptTable):
            wavFile = wavFile.sort()
            wavFile.save(wavFileTemp)
            allFiles = [
                wavFileTemp.name,
            ]
        else:
            raise UnsupportedType(
                f'Expected a filename-like string or ScriptTable but got a {type_name(wavFile)}.'
            )

        results = []
        for wavFile in allFiles:
            wavFile = os.path.abspath(wavFile)
            if wavFile[-3:].lower() == "wav":
                dirName = os.path.dirname(wavFile)
                fileName = os.path.basename(wavFile)
                uttID = "".join(fileName[0:-4].split("."))
                cmd = f"echo {uttID} {wavFile} | {kaldiTool} scp,p:- ark:-"
            elif wavFile[-3:].lower() == 'scp':
                cmd = f"{kaldiTool} scp,p:{wavFile} ark:-"
            elif "wav" in useSuffix:
                dirName = os.path.dirname(wavFile)
                fileName = os.path.basename(wavFile)
                uttID = "".join(fileName[0:-4].split("."))
                cmd = f"echo {uttID} {wavFile} | {kaldiTool} scp,p:- ark:-"
            elif "scp" in useSuffix:
                cmd = f"{kaldiTool} scp,p:{wavFile} ark:-"
            else:
                raise UnsupportedType(
                    'Unknown file suffix. You can declare it by making <useSuffix> "wav" or "scp".'
                )

            out, err, cod = run_shell_command(cmd,
                                              stdout=subprocess.PIPE,
                                              stderr=subprocess.PIPE)
            if (isinstance(cod, int) and cod != 0) or out == b'':
                print(err.decode())
                raise KaldiProcessError(f'Failed to compute feature:{name}.')
            else:
                results.append(BytesFeature(out))
    finally:
        wavFileTemp.close()

    if len(results) == 0:
        raise WrongOperation("No any feature date in file path.")
    else:
        result = results[0]
        for i in results[1:]:
            result += i
        result.rename(name)
        return result
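
# A hedged usage sketch (this is a private helper; a public wrapper would
# typically build <kaldiTool> from a Kaldi feature binary and its options):
# feat = __compute_feature("data/train/wav.scp",
#                          "compute-mfcc-feats --use-energy=false", name="mfcc")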
Example #17
def gmm_decode(feat, hmm, HCLGFile, wordSymbolTable, beam=10, latBeam=8, acwt=1,
				minActive=200, maxActive=7000, maxMem=50000000, config=None, maxThreads=1):
	'''
	Decode by generating lattice from feature and GMM model.

	Args:
		<feat>: An exkaldi feature object.
		<hmm>: An exkaldi HMM object or file path.
		<HCLGFile>: HCLG file path.
		<wordSymbolTable>: words.txt file path or exkaldi LexiconBank object or exkaldi ListTable object.
		<beam>: beam size.
		<latBeam>: lattice beam size.
		<acwt>: acoustic model weight.
		<minActive>: minimum number of active states.
		<maxActive>: maximum number of active states.
		<maxMem>: maximum approximate memory usage (in bytes) for lattice determinization.
		<config>: decode configure file.
		<maxThreads>: the number of parallel threads.
		
		Some common options can be assigned directly. If you want to use more, set <config> = your-configure, but if you do this, the common configures provided here will be ignored.
		You can use the .check_config('gmm_decode') function to get the configure information you could set.
		Also run the shell command "gmm-latgen-faster" to look over their meaning.
	Return:
		An exkaldi Lattice object.
	''' 
	ExkaldiInfo.vertify_kaldi_existed()

	if type_name(feat) == "BytesFeature":
		pass
	elif type_name(feat) == "NumpyFeature":
		feat = feat.to_bytes()
	else:
		raise UnsupportedType(f"Expected <feat> is an exkaldi feature object but got: {type_name(feat)}.")
		
	assert isinstance(HCLGFile, str), "<HCLGFile> should be a file path."
	if not os.path.isfile(HCLGFile):
		raise WrongPath(f"No such file:{HCLGFile}")

	if maxThreads > 1:
		kaldiTool = f"gmm-latgen-faster-parallel --num-threads={maxThreads} "
	else:
		kaldiTool = "gmm-latgen-faster " 

	kaldiTool += f'--allow-partial=true '
	kaldiTool += f'--min-active={minActive} '
	kaldiTool += f'--max-active={maxActive} '  
	kaldiTool += f'--max-mem={maxMem} '
	kaldiTool += f'--beam={beam} '
	kaldiTool += f'--lattice-beam={latBeam} '
	kaldiTool += f'--acoustic-scale={acwt} '

	wordsTemp = tempfile.NamedTemporaryFile("w+", suffix="_words.txt", encoding="utf-8")
	modelTemp = tempfile.NamedTemporaryFile("wb+", suffix=".mdl")

	try:
		if type_name(wordSymbolTable) == "LexiconBank":
			wordSymbolTable.dump_dict("words", wordsTemp)
			wordsFile = wordsTemp.name
		elif type_name(wordSymbolTable) == "ListTable":
			wordSymbolTable.save(wordsTemp)
			wordsTemp.seek(0)
			wordsFile = wordsTemp.name
		elif isinstance(wordSymbolTable, str):
			if not os.path.isfile(wordSymbolTable):
				raise WrongPath(f"No such file:{wordSymbolTable}.")
			else:
				wordsFile = wordSymbolTable
		else:
			raise UnsupportedType(f"<wordSymbolTable> should be a file path or exkaldi LexiconBank object but got {type_name(wordSymbolTable)}.")

		kaldiTool += f'--word-symbol-table={wordsFile} '

		if config is not None:
			if check_config(name='gmm_decode', config=config):
				for key,value in config.items():
					if isinstance(value, bool):
						if value is True:
							kaldiTool += f"{key} "
					else:
						kaldiTool += f" {key}={value}"

		if type_name(hmm) in ["MonophoneHMM", "TriphoneHMM"]:
			modelTemp.write(hmm.data)
			modelTemp.seek(0)
			hmmFile = modelTemp.name
		elif isinstance(hmm, str):
			if not os.path.isfile(hmm):
				raise WrongPath(f"No such file:{hmm}.")
			else:
				hmmFile = hmm
		else:
			raise UnsupportedType(f"<hmm> should be exkaldi HMM object or file path but got {type_name(hmm)}.")
		
		cmd = f'{kaldiTool} {hmmFile} {HCLGFile} ark:- ark:-'
		out, err, cod = run_shell_command(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, inputs=feat.data)

		if cod != 0 or out == b'':
			print(err.decode())
			raise KaldiProcessError('Failed to generate lattice.')
		else:
			newName = f"lat({feat.name})"
			return Lattice(data=out, name=newName)
	
	finally:
		wordsTemp.close()
		modelTemp.close()
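
# A hedged usage sketch (hypothetical paths; assumes <feat> and <hmm> are
# prepared exkaldi objects):
# lat = gmm_decode(feat, hmm, HCLGFile="exp/graph/HCLG.fst",
#                  wordSymbolTable="exp/graph/words.txt", beam=13, latBeam=6)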
Example #18
def wer(ref, hyp, ignore=None, mode='all'):
    '''
	Compute WER (word error rate) between <ref> and <hyp>. 

	Args:
		<ref>, <hyp>: exkaldi transcription object or file path.
		<ignore>: a symbol to be ignored.
		<mode>: "all" or "present".
	Return:
		a namedtuple of score information.
	'''
    assert mode in ['all',
                    'present'], 'Expected <mode> to be "present" or "all".'
    ExkaldiInfo.vertify_kaldi_existed()

    hypTemp = tempfile.NamedTemporaryFile("w+",
                                          suffix=".txt",
                                          encoding="utf-8")
    refTemp = tempfile.NamedTemporaryFile("w+",
                                          suffix=".txt",
                                          encoding="utf-8")
    try:
        if ignore is None:
            if type_name(hyp) == "Transcription":
                hyp.save(hypTemp)
                hypTemp.seek(0)
                hypFileName = hypTemp.name
            elif isinstance(hyp, str):
                if not os.path.isfile(hyp):
                    raise WrongPath(f"No such file:{hyp}.")
                else:
                    hypFileName = hyp
            else:
                raise UnsupportedType(
                    '<hyp> should be an exkaldi Transcription object or a file path.'
                )

            if type_name(ref) == "Transcription":
                ref.save(refTemp)
                refTemp.seek(0)
                refFileName = refTemp.name
            elif isinstance(ref, str):
                if not os.path.isfile(ref):
                    raise WrongPath(f"No such file:{ref}.")
                else:
                    refFileName = ref
            else:
                raise UnsupportedType(
                    '<ref> should be an exkaldi Transcription object or a file path.'
                )

            cmd = f'compute-wer --text --mode={mode} ark:{refFileName} ark,p:{hypFileName}'
            scoreOut, scoreErr, _ = run_shell_command(cmd,
                                                      stdout=subprocess.PIPE,
                                                      stderr=subprocess.PIPE)
        else:
            if type_name(hyp) == "Transcription":
                hyp = hyp.save()
            elif isinstance(hyp, str):
                if not os.path.isfile(hyp):
                    raise WrongPath(f"No such file:{hyp}.")
                else:
                    with open(hyp, "r", encoding="utf-8") as fr:
                        hyp = fr.read()
            else:
                raise UnsupportedType(
                    '<hyp> should be an exkaldi Transcription object or a file path.'
                )

            cmd = f'sed "s/{ignore} //g" > {hypTemp.name}'
            hypOut, err, cod = run_shell_command(cmd,
                                                 stdin=subprocess.PIPE,
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE,
                                                 inputs=hyp.encode())
            # stdout is redirected into the temp file, so check the exit code
            # and the file size rather than the captured output
            if cod != 0 or os.path.getsize(hypTemp.name) == 0:
                print(err.decode())
                raise WrongDataFormat("<hyp> has a wrong data format.")

            if type_name(ref) == "Transcription":
                ref = ref.save()
            elif isinstance(ref, str):
                if not os.path.isfile(ref):
                    raise WrongPath(f"No such file:{ref}.")
                else:
                    with open(ref, "r", encoding="utf-8") as fr:
                        ref = fr.read()
            else:
                raise UnsupportedType(
                    '<ref> should be an exkaldi Transcription object or a file path.'
                )

            cmd = f'sed "s/{ignore} //g" > {refTemp.name}'
            refOut, err, cod = run_shell_command(cmd,
                                                 stdin=subprocess.PIPE,
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE,
                                                 inputs=ref.encode())
            if cod != 0 or os.path.getsize(refTemp.name) == 0:
                print(err.decode())
                raise WrongDataFormat("<ref> has a wrong data format.")

            cmd = f'compute-wer --text --mode={mode} ark:{refTemp.name} ark,p:{hypTemp.name}'
            scoreOut, scoreErr, _ = run_shell_command(cmd,
                                                      stdout=subprocess.PIPE,
                                                      stderr=subprocess.PIPE)

    finally:
        hypTemp.close()
        refTemp.close()

    if len(scoreOut) == 0:
        print(scoreErr.decode())
        raise KaldiProcessError("Failed to compute WER.")

    else:
        out = scoreOut.decode().split("\n")
        pattern1 = r'%WER (.*) \[ (.*) \/ (.*), (.*) ins, (.*) del, (.*) sub \]'
        pattern2 = r"%SER (.*) \[ (.*) \/ (.*) \]"
        pattern3 = r"Scored (.*) sentences, (.*) not present in hyp."
        s1 = re.findall(pattern1, out[0])[0]
        s2 = re.findall(pattern2, out[1])[0]
        s3 = re.findall(pattern3, out[2])[0]

        return namedtuple("Score", [
            "WER", "words", "insErr", "delErr", "subErr", "SER", "sentences",
            "wrongSentences", "missedSentences"
        ])(
            float(s1[0]),  #WER
            int(s1[2]),  #words
            int(s1[3]),  #ins
            int(s1[4]),  #del
            int(s1[5]),  #sub
            float(s2[0]),  #SER
            int(s2[1]),  #sentences
            int(s2[2]),  #wrong sentences
            int(s3[1])  #missed sentences
        )
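
# A hedged usage sketch (hypothetical paths): the returned namedtuple exposes
# the parsed fields directly:
# score = wer("data/test/text", hyp, mode="present")
# print(score.WER, score.insErr, score.delErr, score.subErr)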
Example #19
def edit_distance(ref, hyp, ignore=None, mode='present'):
    '''
	Compute edit-distance score.

	Args:
		<ref>, <hyp>: Transcription objects or iterable objects like list, tuple or NumPy array. It will be flattened before scoring.
		<ignore>: Ignoring specific symbols.
		<mode>: When both are Transcription objects, if mode is 'present', skip the missed utterances.
	Return:
		a namedtuple object including score information.	
	'''
    if type_name(ref) == "Transcription":
        pass
    elif isinstance(ref, str):
        if not os.path.isfile(ref):
            raise WrongPath(f"No such file:{ref}.")
        else:
            ref = load_trans(ref)
    else:
        raise UnsupportedType(
            '<ref> should be an exkaldi Transcription object or a file path.')

    if type_name(hyp) == "Transcription":
        pass
    elif isinstance(hyp, str):
        if not os.path.isfile(hyp):
            raise WrongPath(f"No such file:{hyp}.")
        else:
            hyp = load_trans(hyp)
    else:
        raise UnsupportedType(
            '<hyp> should be an exkaldi Transcription object or a file path.')

    allED = 0
    words = 0
    SER = 0
    sentences = 0
    wrongSentences = 0
    missedSentences = 0

    ref = ref.sort()
    hyp = hyp.sort()
    for utt, hypTrans in hyp.items():
        try:
            refTrans = ref[utt]
        except KeyError as e:
            if mode == "all":
                raise Exception(
                    "Missing transcription in reference, set <mode> as 'all' to skip it."
                )
            else:
                missedSentences += 1
        else:
            sentences += 1
            refTrans = refTrans.split()
            hypTrans = hypTrans.split()
            ed, wds = pure_edit_distance(refTrans, hypTrans, ignore=ignore)
            allED += ed
            words += wds
            if ed > 0:
                wrongSentences += 1
    if sentences == 0:
        raise Exception("Missing all transcription in reference.")

    return namedtuple("Score", [
        "editDistance", "words", "SER", "sentences", "wrongSentences",
        "missedSentences"
    ])(allED, words, wrongSentences / sentences, sentences, wrongSentences,
       missedSentences)
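
# A hedged usage sketch (Transcription objects or file paths):
# score = edit_distance(ref, hyp, ignore="<sil>")
# print(score.editDistance / score.words)  # token-level error rate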
Example #20
def ctc_prefix_beam_search(prob,
                           vocabs,
                           blankID=None,
                           beam=5,
                           cutoff=0.999,
                           strick=1.0,
                           lmFile=None,
                           alpha=1.0,
                           beta=0):
    '''
    Prefix beam search decoding algorithm. LM score is supported.

    Args:
        <prob>: An exkaldi post-probability object. This probability should be the output of a neural network trained with CTC loss.
                We expect the probability has not passed through any activation function, or it may generate wrong results.
        <vocabs>: a list of vocabulary.
        <blankID>: specify the ID of the blank symbol. If None, use the last dimension of <prob>.
        <beam>: the beam size.
        <cutoff>: the sum threshold to cut off dimensions whose probability is extremely small.
        <strick>: when the decoding results of two adjacent frames are the same, the probability of the latter will be reduced.
        <lmFile>: if not None, add the language model score to the beam.
        <alpha>: the weight of the LM score.
        <beta>: the length normalization weight of the LM score.
    Return:
        An exkaldi Transcription object of decoding results.
    '''
    assert isinstance(
        vocabs,
        (list,
         tuple)), f"<vocabs> must be a list of vocabulary but got {vocabs}."

    if type_name(prob) == "BytesPostProbability":
        prob = prob.to_numpy()
    elif type_name(prob) == "NumpyPostProbability":
        pass
    else:
        raise UnsupportedType(
            f"<prob> should be an exkaldi probability object but got {type_name(prob)}."
        )

    if lmFile is not None:
        assert isinstance(lmFile, str) and len(
            lmFile) > 0, "Language model file path is unavailable."
        if not os.path.isfile(lmFile):
            raise WrongPath(f"No such file:{lmFile}.")
    else:
        lmFile = "none"

    probDim = prob.dims
    if len(vocabs) == probDim:
        if blankID is None:
            blankID = probDim - 1
        else:
            assert isinstance(
                blankID, int
            ) and 0 <= blankID < probDim, f"BlankID {blankID} is out of range of int sequences from 0 to {probDim-1}."
    elif len(vocabs) == probDim - 1:
        if blankID is None:
            blankID = probDim - 1
        else:
            assert blankID == probDim - 1, f"The dimensionality of probability is {probDim} but there are only {len(vocabs)} words. In this case, blank ID must be {probDim-1} but got {blankID}."
    else:
        raise WrongDataFormat(
            f"The dimensionality of probability {probDim} does not match the number of words {len(vocabs)}."
        )

    for ID, word in enumerate(vocabs):
        if len(word.strip()) == 0:
            raise WrongDataFormat(f"Found an empty vocab entry at ID {ID}.")

    num_classes = len(vocabs)
    vocabs = " ".join(vocabs)

    sources = [
        vocabs.encode(),
    ]
    uttTemp = []
    for utt, pb in prob.items():
        assert isinstance(pb, np.ndarray) and len(
            pb.shape) == 2, "Unsupported probability matrix format."
        uttTemp.append(utt)  # keep utterance IDs aligned with the input order
        pb = softmax(pb, axis=1)
        sources.append(f" {pb.shape[0]} ".encode() +
                       pb.astype("float32").tobytes())

    sources = b"".join(sources)

    cmd = os.path.join(sys.prefix, "exkaldisrc", "tools",
                       "prefix_beam_search_decode")
    cmd += " --num_files {}".format(prob.lens[0])
    cmd += " --num_classes {}".format(num_classes)
    cmd += " --blank_id {}".format(blankID)
    cmd += " --lm_model {}".format(lmFile)
    cmd += " --beam_size {}".format(beam)
    cmd += " --cutoff_prob {}".format(cutoff)
    cmd += " --alpha {}".format(alpha)
    cmd += " --beta {}".format(beta)

    out, err, _ = run_shell_command(cmd,
                                    stdin=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    inputs=sources)

    if len(out) == 0:
        print(err.decode())
        raise Exception("Failed to beam search decode.")
    else:
        results = Transcription(name="beamSearchResults")
        out = out.decode().strip().split("file")
        for index, re in enumerate(out[1:]):
            re = re.strip().split("\n")
            if len(re) <= 1:
                results[uttTemp[index]] = ""
            else:
                results[uttTemp[index]] = " ".join(re[1].strip().split()[1:])

        return results
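
# A hedged usage sketch (assumes <prob> holds pre-activation CTC outputs and
# <vocabs> excludes the blank symbol, so blank falls on the last dimension):
# trans = ctc_prefix_beam_search(prob, vocabs, beam=10,
#                                lmFile="lm/3gram.bin", alpha=0.5)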
Example #21
def load_ali(target, aliType=None, name="ali", hmm=None):
    '''
	Load alignment data.

	Args:
		<target>: Python dict object, bytes object, exkaldi alignment object, kaldi alignment file or .npy file.
		<aliType>: None, or one of 'transitionID', 'phoneID', 'pdfID'. It determines which alignment object is returned.
		<name>: a string.
		<hmm>: file path or exkaldi HMM object.
	Return:
		exkaldi alignment data objects.
	'''
    assert isinstance(
        name, str) and len(name) > 0, "Name should be a valid string."

    ExkaldiInfo.vertify_kaldi_existed()

    def transform(data, cmd):
        out, err, cod = run_shell_command(cmd,
                                          stdin=subprocess.PIPE,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE,
                                          inputs=data)
        if (isinstance(cod, int) and cod != 0) or out == b'':
            print(err.decode())
            raise KaldiProcessError('Failed to transform alignment.')
        else:
            result = {}
            sp = BytesIO(out)
            for line in sp.readlines():
                line = line.decode()
                line = line.strip().split()
                utt = line[0]
                matrix = np.array(line[1:], dtype=np.int32)
                result[utt] = matrix
            return result

    if isinstance(target, dict):
        if aliType is None:
            result = NumpyAlignment(target, name)
        elif aliType == "transitionID":
            result = NumpyAlignmentTrans(target, name)
        elif aliType == "phoneID":
            result = NumpyAlignmentPhone(target, name)
        elif aliType == "pdfID":
            result = NumpyAlignmentPdf(target, name)
        else:
            raise WrongOperation(
                f"<aliType> should be None, 'transitionID', 'phoneID' or 'pdfID' but got {aliType}."
            )
        result.check_format()
        return result

    elif type_name(target) in [
            "NumpyAlignment", "NumpyAlignmentTrans", "NumpyAlignmentPhone",
            "NumpyAlignmentPdf", "BytesAlignmentTrans"
    ]:
        result = copy.deepcopy(target)
        result.rename(name)
        return result

    elif isinstance(target, str):

        allFiles = list_files(target)

        results = {
            "NumpyAlignment": NumpyAlignment(),
            "NumpyAlignmentTrans": NumpyAlignmentTrans(),
            "NumpyAlignmentPhone": NumpyAlignmentPhone(),
            "NumpyAlignmentPdf": NumpyAlignmentPdf(),
            "BytesAlignmentTrans": BytesAlignmentTrans(),
        }

        for fileName in allFiles:
            fileName = os.path.abspath(fileName)

            if fileName.endswith(".npy"):
                temp = __read_data_from_file(fileName, "npy")
                if aliType is None:
                    temp = NumpyAlignment(temp.data)
                    results["NumpyAlignment"] += temp
                elif aliType == "transitionID":
                    temp = NumpyAlignmentTrans(temp.data)
                    results["NumpyAlignmentTrans"] += temp
                elif aliType == "phoneID":
                    temp = NumpyAlignmentPhone(temp.data)
                    results["NumpyAlignmentPhone"] += temp
                elif aliType == "pdfID":
                    temp = NumpyAlignmentPdf(temp.data)
                    results["NumpyAlignmentPdf"] += temp
                else:
                    raise WrongOperation(
                        f"<aliType> should be None, 'transitionID','phoneID' or 'pdfID' but got {aliType}."
                    )

            else:
                if fileName.endswith('.gz'):
                    cmd = f'gunzip -c {fileName}'
                else:
                    cmd = f'cat {fileName}'

                if aliType is None or aliType == "transitionID":
                    out, err, cod = run_shell_command(cmd,
                                                      stdout=subprocess.PIPE,
                                                      stderr=subprocess.PIPE)
                    if (isinstance(cod, int) and cod != 0) or out == b'':
                        print(err.decode())
                        raise ShellProcessError(
                            "Failed to get the alignment data from file.")
                    else:
                        temp = BytesAlignmentTrans(out)
                        results["BytesAlignmentTrans"] += temp

                else:
                    temp = tempfile.NamedTemporaryFile("wb+")
                    try:
                        if type_name(hmm) in ("HMM", "MonophoneHMM",
                                              "TriphoneHMM"):
                            hmm.save(temp)
                            hmmFileName = temp.name
                        elif isinstance(hmm, str):
                            if not os.path.isfile(hmm):
                                raise WrongPath(f"No such file:{hmm}.")
                            hmmFileName = hmm
                        else:
                            raise UnsupportedType(
                                f"<hmm> should be a filePath or exkaldi HMM and its sub-class object. but got {type_name(hmm)}."
                            )

                        if aliType == "phoneID":
                            cmd += f" | ali-to-phones --per-frame=true {hmmFileName} ark:- ark,t:-"
                            temp = transform(None, cmd)
                            temp = NumpyAlignmentPhone(temp)
                            results["NumpyAlignmentPhone"] += temp

                        elif target == "pdfID":
                            cmd = f" | ali-to-pdf {hmmFileName} ark:- ark,t:-"
                            temp = transform(None, cmd)
                            temp = NumpyAlignmentPdf(temp)
                            results["NumpyAlignmentPdf"] += temp
                        else:
                            raise WrongOperation(
                                f"<target> should be 'trainsitionID', 'phoneID' or 'pdfID' but got {target}."
                            )

                    finally:
                        temp.close()

        finalResult = []
        for obj in results.values():
            if not obj.is_void:
                obj.rename(name)
                finalResult.append(obj)

        if len(finalResult) == 0:
            raise WrongOperation(
                "<target> does not include any available data.")
        elif len(finalResult) == 1:
            finalResult = finalResult[0]

        return finalResult
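
# A hedged usage sketch (hypothetical paths; an HMM is required to convert
# transition IDs into phone or pdf IDs):
# ali = load_ali("exp/tri1/ali.*.gz", aliType="phoneID", hmm="exp/tri1/final.mdl")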
Example #22
def train_ngrams_kenlm(lexicons, order, textFile, outFile, config=None):
    '''
	Train an n-grams language model with the KenLM toolkit.

	Args:
		<lexicons>: words.txt file path or an exkaldi LexiconBank object.
		<order>: the maximum order of n-grams.
		<textFile>: text corpus file.
		<outFile>: ARPA out file name.
		<config>: configures, a Python dict object.

	You can use the .check_config("train_ngrams_kenlm") function to get the configure information that you can set.
	Also you can run the shell command "lmplz" to look over their meaning.
	'''
    assert isinstance(
        order, int
    ) and 0 < order <= 6, "We support a maximum of 6-grams LM in the current version."

    if not os.path.isfile(textFile):
        raise WrongPath("No such file:{}".format(textFile))
    else:
        ## Should check the numbers of lines
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
        if (isinstance(cod, int) and cod != 0):
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat(
                    "The text file doesn't seem to be separated by spaces, or the lines are extremely short."
                )

    extraConfig = " "
    if config is not None:
        assert isinstance(
            config, dict
        ), f"<config> should be a dict object but got: {type_name(config)}."
        if check_config(name='train_ngrams_kenlm', config=config):
            if "--temp_prefix" in config.keys() and "-T" in config.keys():
                raise WrongOperation(
                    f'"--temp_prefix" and "-T" is the same configure so only one of them is expected.'
                )
            if "--memory" in config.keys() and "-S" in config.keys():
                raise WrongOperation(
                    f'"--memory" and "-S" is the same configure so only one of them is expected.'
                )
            for key, value in config.items():
                if isinstance(value, bool):
                    if value is True:
                        extraConfig += f"{key} "
                else:
                    extraConfig += f"{key} {value} "

    assert isinstance(outFile, str), "<outFile> should be a string."
    if not outFile.rstrip().endswith(".arpa"):
        outFile += ".arpa"
    make_dependent_dirs(outFile, pathIsFile=True)

    words = tempfile.NamedTemporaryFile("w+", suffix=".txt", encoding="utf-8")
    try:
        if type_name(lexicons) == "LexiconBank":
            ws = lexicons("words")
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws.keys())
        elif isinstance(lexicons, str):
            if not os.path.isfile(lexicons):
                raise WrongPath(f"No such file:{lexicons}.")
            with open(lexicons, "r", encoding="utf-8") as fr:
                lines = fr.readlines()
            ws = []
            for line in lines:
                line = line.strip().split(maxsplit=1)
                if len(line) < 1:
                    continue
                else:
                    ws.append(line[0])
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws)
        else:
            raise UnsupportedType(
                "<lexicons> should be LexiconBank object or file path.")

        words.write(ws)
        words.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {textFile} --arpa {outFile} --limit_vocab_file {words.name}"
        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (
                not os.path.isfile(outFile)) or (os.path.getsize(outFile)
                                                 == 0):
            print(err.decode())
            raise KenlmProcessError("Failed to generate arpa file.")
        else:
            return os.path.abspath(outFile)

    finally:
        words.close()
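
# A hedged usage sketch (assumes <lexicons> is an exkaldi LexiconBank object;
# paths and the "-S" memory option are hypothetical):
# arpaPath = train_ngrams_kenlm(lexicons, order=4, textFile="data/lm_train.txt",
#                               outFile="lm/4gram.arpa", config={"-S": "20%"})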