def compute_fbank(target, rate=16000, frameWidth=25, frameShift=10,
                  melBins=23, windowType='povey', useSuffix=None,
                  config=None, name="fbank", outFile=None):
    '''
    Compute fbank feature.

    Share Args:
        Null

    Parallel Args:
        <target>: wave file, scp file, exkaldi ListTable object or WavSegment object.
                If it is a wave file, its file name is used as utterance ID.
        <rate>: sample rate.
        <frameWidth>: window width (ms).
        <frameShift>: window shift width (ms).
        <melBins>: the number of mel filter banks.
        <windowType>: window type.
        <useSuffix>: if the suffix of the file is not .scp or .wav, use this to specify it.
        <config>: extra optional configurations.
        <name>: the name of the output feature.
        <outFile>: output file name.

    Some usual options can be assigned directly. If you want to use more, set <config> = your-configuration.
    You can use the exkaldi.check_config('compute_fbank') function to get a reference of extra configurations.
    Also you can run the shell command "compute-fbank-feats" to look up their usage.

    Return:
        exkaldi feature or index table object.
    '''
    # check the basic configuration parameters to build base commands
    stdParameters = check_multiple_resources(rate, frameWidth, frameShift, melBins, windowType, config)

    baseCmds = []
    for rate, frameWidth, frameShift, melBins, windowType, config, _ in zip(*stdParameters):
        declare.is_positive_int("rate", rate)
        declare.is_positive_int("frameWidth", frameWidth)
        declare.is_positive_int("frameShift", frameShift)
        declare.is_positive_int("melBins", melBins)
        declare.greater_equal("frameWidth", frameWidth, "frameShift", frameShift)
        declare.is_instances("windowType", windowType, ["hamming", "hanning", "povey", "rectangular", "blackman"])

        kaldiTool = 'compute-fbank-feats --allow-downsample --allow-upsample '
        kaldiTool += f'--sample-frequency={rate} '
        kaldiTool += f'--frame-length={frameWidth} '
        kaldiTool += f'--frame-shift={frameShift} '
        kaldiTool += f'--num-mel-bins={melBins} '
        kaldiTool += f'--window-type={windowType} '

        if config is not None:
            if check_config(name='compute_fbank', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            kaldiTool += f"{key} "
                    else:
                        kaldiTool += f"{key}={value} "

        baseCmds.append(kaldiTool)

    # run the common function
    return __compute_feature(target, baseCmds, useSuffix, name, outFile)
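# Hedged usage sketch for the multi-resource form of compute_fbank above:
# check_multiple_resources lets list-valued arguments describe several parallel
# extraction tasks. The scp paths and outFile names are hypothetical, and the
# exact return type (feature object vs. index table, single vs. list) depends
# on how <outFile> is given in your exkaldi version.
#
#   tables = compute_fbank(
#       target=["data/a/wav.scp", "data/b/wav.scp"],    # two parallel tasks
#       melBins=[23, 40],                               # per-task mel bins
#       outFile=["exp/a/fbank.ark", "exp/b/fbank.ark"],
#   )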
def compute_fbank(wavFile, rate=16000, frameWidth=25, frameShift=10,
                  melBins=23, windowType='povey', useSuffix=None,
                  config=None, name="fbank"):
    '''
    Compute fbank feature.

    Args:
        <wavFile>: wave file, scp file or exkaldi ScriptTable object.
                If it is a wave file, its file name is used as utterance ID.
        <rate>: sample rate.
        <frameWidth>: window width (ms).
        <frameShift>: window shift width (ms).
        <melBins>: the number of mel filter banks.
        <windowType>: window type.
        <useSuffix>: if the suffix of the file is not .scp, use this to specify it.
        <config>: use this to assign more configurations. If used, all the configurations above will be skipped.
        <name>: the name of the feature.

    Some usual options can be assigned directly. If you want to use more, set <config> = your-configuration,
    but if you do this, the usual configurations above will be ignored.
    You can use the .check_config('compute_fbank') function to get the configurations that you can set.
    Also you can run the shell command "compute-fbank-feats" to look up their meaning.

    Return:
        An exkaldi bytes feature object.
    '''
    assert isinstance(frameWidth, int) and frameWidth > 0, \
        f"<frameWidth> should be a positive int value but got {type_name(frameWidth)}."
    assert isinstance(frameShift, int) and frameShift > 0, \
        f"<frameShift> should be a positive int value but got {type_name(frameShift)}."
    assert frameWidth > frameShift, "<frameWidth> must be larger than <frameShift>."
    assert windowType in ["hamming", "hanning", "povey", "rectangular", "blackman"], \
        f'<windowType> should be "hamming", "hanning", "povey", "rectangular" or "blackman", but got: {windowType}.'

    kaldiTool = 'compute-fbank-feats --allow-downsample --allow-upsample '
    kaldiTool += f'--sample-frequency={rate} '
    kaldiTool += f'--frame-length={frameWidth} '
    kaldiTool += f'--frame-shift={frameShift} '
    kaldiTool += f'--num-mel-bins={melBins} '
    kaldiTool += f'--window-type={windowType} '

    if config is not None:
        if check_config(name='compute_fbank', config=config):
            for key, value in config.items():
                if isinstance(value, bool):
                    if value is True:
                        kaldiTool += f"{key} "
                else:
                    kaldiTool += f" {key}={value}"

    result = __compute_feature(wavFile, kaldiTool, useSuffix, name)
    return result
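# Hedged usage sketch for the single-resource compute_fbank above. The
# "data/train/wav.scp" index file is a hypothetical placeholder, and a working
# Kaldi installation (compute-fbank-feats on PATH) is assumed.
#
#   feat = compute_fbank("data/train/wav.scp", rate=16000, melBins=40)
#   # Boolean True values in <config> become bare flags, so this appends
#   # "--use-energy" to the compute-fbank-feats command line:
#   featE = compute_fbank("data/train/wav.scp", config={"--use-energy": True})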
def gmm_decode(feat, hmm, HCLGFile, wordSymbolTable, beam=10, latBeam=8, acwt=1,
               minActive=200, maxActive=7000, maxMem=50000000, config=None, maxThreads=1):
    '''
    Decode by generating a lattice from features and a GMM model.

    Args:
        <feat>: an exkaldi feature object.
        <hmm>: an exkaldi HMM object or file path.
        <HCLGFile>: HCLG file path.
        <wordSymbolTable>: words.txt file path, exkaldi LexiconBank object or exkaldi ListTable object.
        <beam>: beam size.
        <latBeam>: lattice beam size.
        <acwt>: acoustic model weight.
        <minActive>: the minimum number of active decoding states.
        <maxActive>: the maximum number of active decoding states.
        <maxMem>: the approximate maximum memory usage (in bytes) in lattice determinization.
        <config>: extra decoding configurations.
        <maxThreads>: the number of threads to use.

    Some usual options can be assigned directly. If you want to use more, set <config> = your-configuration,
    but if you do this, the usual configurations above will be ignored.
    You can use the .check_config('gmm_decode') function to get the configurations you can set.
    Also run the shell command "gmm-latgen-faster" to look up their meaning.

    Return:
        An exkaldi Lattice object.
    '''
    ExkaldiInfo.vertify_kaldi_existed()

    if type_name(feat) == "BytesFeature":
        pass
    elif type_name(feat) == "NumpyFeature":
        feat = feat.to_bytes()
    else:
        raise UnsupportedType(f"Expected <feat> is an exkaldi feature object but got: {type_name(feat)}.")

    assert isinstance(HCLGFile, str), "<HCLGFile> should be a file path."
    if not os.path.isfile(HCLGFile):
        raise WrongPath(f"No such file: {HCLGFile}")

    if maxThreads > 1:
        kaldiTool = f"gmm-latgen-faster-parallel --num-threads={maxThreads} "
    else:
        kaldiTool = "gmm-latgen-faster "

    kaldiTool += f'--allow-partial=true '
    kaldiTool += f'--min-active={minActive} '
    kaldiTool += f'--max-active={maxActive} '
    kaldiTool += f'--max-mem={maxMem} '
    kaldiTool += f'--beam={beam} '
    kaldiTool += f'--lattice-beam={latBeam} '
    kaldiTool += f'--acoustic-scale={acwt} '

    wordsTemp = tempfile.NamedTemporaryFile("w+", suffix="_words.txt", encoding="utf-8")
    modelTemp = tempfile.NamedTemporaryFile("wb+", suffix=".mdl")

    try:
        if type_name(wordSymbolTable) == "LexiconBank":
            wordSymbolTable.dump_dict("words", wordsTemp)
            wordsFile = wordsTemp.name
        elif type_name(wordSymbolTable) == "ListTable":
            wordSymbolTable.save(wordsTemp)
            wordsTemp.seek(0)
            wordsFile = wordsTemp.name
        elif isinstance(wordSymbolTable, str):
            if not os.path.isfile(wordSymbolTable):
                raise WrongPath(f"No such file: {wordSymbolTable}.")
            else:
                wordsFile = wordSymbolTable
        else:
            raise UnsupportedType(f"<wordSymbolTable> should be a file path or exkaldi LexiconBank object but got {type_name(wordSymbolTable)}.")

        kaldiTool += f'--word-symbol-table={wordsFile} '

        if config is not None:
            if check_config(name='gmm_decode', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            kaldiTool += f"{key} "
                    else:
                        kaldiTool += f" {key}={value}"

        if type_name(hmm) in ["MonophoneHMM", "TriphoneHMM"]:
            modelTemp.write(hmm.data)
            modelTemp.seek(0)
            hmmFile = modelTemp.name
        elif isinstance(hmm, str):
            if not os.path.isfile(hmm):
                raise WrongPath(f"No such file: {hmm}.")
            else:
                hmmFile = hmm
        else:
            raise UnsupportedType(f"<hmm> should be an exkaldi HMM object or file path but got {type_name(hmm)}.")

        cmd = f'{kaldiTool} {hmmFile} {HCLGFile} ark:- ark:-'
        out, err, cod = run_shell_command(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, inputs=feat.data)

        if cod != 0 or out == b'':
            print(err.decode())
            raise KaldiProcessError('Failed to generate lattice.')
        else:
            newName = f"lat({feat.name})"
            return Lattice(data=out, name=newName)

    finally:
        wordsTemp.close()
        modelTemp.close()
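# Hedged usage sketch for gmm_decode above. All paths ("exp/tri1/final.mdl",
# "exp/tri1/graph/HCLG.fst", "exp/tri1/graph/words.txt") are hypothetical
# placeholders for a trained GMM-HMM system; the features must match those
# used during training.
#
#   lat = gmm_decode(feat, "exp/tri1/final.mdl", "exp/tri1/graph/HCLG.fst",
#                    "exp/tri1/graph/words.txt",
#                    beam=13, latBeam=6, acwt=0.083333, maxThreads=4)
#   # lat is an exkaldi Lattice object named "lat(<feat name>)".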
def train_ngrams_srilm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-grams language model with the SRILM toolkit.
    If you don't specify the discounting method via the <config> option, "kndiscount" is used by default.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of the ARPA LM.
        <config>: extra configurations, a Python dict object.

    You can use the .check_config("train_ngrams_srilm") function to get a reference of extra configurations.
    Also you can run the shell command "ngram-count" to look up their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    # verify the max order
    declare.less_equal("order", order, "max order", 9)
    # prepare the srilm tool
    ExkaldiInfo.prepare_srilm()

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that is separated by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if isinstance(cod, int) and cod != 0:
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        unkSymbol = lexicons("oov")

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                            if key.endswith("discount"):
                                specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f'ngram-count -text {text} -order {order} -limit-vocab -vocab {wordlistTemp.name} -unk -map-unk "{unkSymbol}" '
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "
        cmd += extraConfig  # forward the extra configurations to ngram-count

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)
        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr="PIPE")
        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError('Failed to generate the N-grams language model.')

        return outFile
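# Hedged usage sketch for train_ngrams_srilm above. <lexicons> is assumed to be
# a prepared exkaldi LexiconBank and "data/train/text" a hypothetical corpus.
#
#   arpa = train_ngrams_srilm(lexicons, order=3, text="data/train/text",
#                             outFile="exp/lm/3gram.arpa")
#   # Any True-valued flag ending in "discount" overrides the default
#   # -kndiscount, e.g. Witten-Bell discounting:
#   arpa = train_ngrams_srilm(lexicons, 3, "data/train/text",
#                             "exp/lm/3gram_wb.arpa", config={"-wbdiscount": True})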
def train_ngrams_kenlm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-grams language model with the KenLM toolkit.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of the ARPA LM.
        <config>: extra configurations, a Python dict object.

    You can use the .check_config("train_ngrams_kenlm") function to get a reference of extra configurations.
    Also you can run the shell command "lmplz" to look up their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    declare.less_equal("order", order, "max order", 9)

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that is separated by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if isinstance(cod, int) and cod != 0:
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        extraConfig = " "
        if config is not None:
            if check_config(name='train_ngrams_kenlm', config=config):
                if "--temp_prefix" in config.keys() and "-T" in config.keys():
                    raise WrongOperation('"--temp_prefix" and "-T" are the same configuration so only one of them is expected.')
                if "--memory" in config.keys() and "-S" in config.keys():
                    raise WrongOperation('"--memory" and "-S" are the same configuration so only one of them is expected.')
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f"{key} {value} "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        wordlistTemp = fhm.create("w+", encoding='utf-8', suffix=".txt")
        words = lexicons("words")
        words_count = math.ceil(len(words) / 10) * 10
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {text} --arpa {outFile} --limit_vocab_file {wordlistTemp.name}"
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate the ARPA file.")

        return outFile
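# Hedged usage sketch for train_ngrams_kenlm above. Paths are hypothetical; the
# lmplz binary is resolved under sys.prefix/exkaldisrc/tools, so an exkaldi
# installation that bundles KenLM is assumed.
#
#   arpa = train_ngrams_kenlm(lexicons, order=4, text="data/train/text",
#                             outFile="exp/lm/4gram.arpa",
#                             config={"--memory": "20%", "--prune": "0 1 1 1"})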
def train_ngrams_srilm(lexicons, order, textFile, outFile, config=None):
    '''
    Train an n-grams language model with the SRILM toolkit.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of n-grams.
        <textFile>: text corpus file.
        <outFile>: ARPA output file name.
        <config>: configurations, a Python dict object.

    You can use the .check_config("train_ngrams_srilm") function to get the configurations that you can set.
    Also you can run the shell command "ngram-count" to look up their meaning.
    '''
    assert isinstance(order, int) and 0 < order < 10, "Expected <order> is a positive int value smaller than 10."
    assert isinstance(textFile, str), "Expected <textFile> is a name-like string."
    assert isinstance(outFile, str), "Expected <outFile> is a name-like string."
    assert type_name(lexicons) == "LexiconBank", f"Expected <lexicons> is an exkaldi LexiconBank object but got {type_name(lexicons)}."

    ExkaldiInfo.prepare_srilm()

    if not os.path.isfile(textFile):
        raise WrongPath(f"No such file: {textFile}")
    else:
        ## Should check the number of lines
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if isinstance(cod, int) and cod != 0:
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")

    wordlist = tempfile.NamedTemporaryFile("w+", encoding='utf-8', suffix=".txt")
    unkSymbol = lexicons("oov")

    try:
        lexiconp = lexicons("lexiconp")
        words = [x[0] for x in lexiconp.keys()]
        wordlist.write("\n".join(words))
        wordlist.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name='train_ngrams_srilm', config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                            if key.endswith("discount"):
                                specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f"ngram-count -text {textFile} -order {order} -limit-vocab -vocab {wordlist.name} -unk -map-unk {unkSymbol} "
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "
        cmd += extraConfig  # forward the extra configurations to ngram-count

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)
        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)
        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError('Failed to generate the n-grams language model.')
        else:
            return os.path.abspath(outFile)

    finally:
        wordlist.close()
def train_ngrams_kenlm(lexicons, order, textFile, outFile, config=None):
    '''
    Train an n-grams language model with the KenLM toolkit.

    Args:
        <lexicons>: words.txt file path or exkaldi LexiconBank object.
        <order>: the maximum order of n-grams.
        <textFile>: text corpus file.
        <outFile>: ARPA output file name.
        <config>: configurations, a Python dict object.

    You can use the .check_config("train_ngrams_kenlm") function to get the configurations that you can set.
    Also you can run the shell command "lmplz" to look up their meaning.
    '''
    assert isinstance(order, int) and 0 < order <= 6, "We support a maximum of 6-grams LM in the current version."

    if not os.path.isfile(textFile):
        raise WrongPath(f"No such file: {textFile}")
    else:
        ## Should check the number of lines
        cmd = f"shuf {textFile} -n 100"
        out, err, cod = run_shell_command(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if isinstance(cod, int) and cod != 0:
            print(err.decode())
            raise ShellProcessError("Failed to sample from text file.")
        elif out == b'':
            raise WrongDataFormat("Void text file.")
        else:
            out = out.decode().strip().split("\n")
            spaceCount = 0
            for line in out:
                spaceCount += line.count(" ")
            if spaceCount < len(out) // 2:
                raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")

    extraConfig = " "
    if config is not None:
        assert isinstance(config, dict), f"<config> should be a dict object but got: {type_name(config)}."
        if check_config(name='train_ngrams_kenlm', config=config):
            if "--temp_prefix" in config.keys() and "-T" in config.keys():
                raise WrongOperation('"--temp_prefix" and "-T" are the same configuration so only one of them is expected.')
            if "--memory" in config.keys() and "-S" in config.keys():
                raise WrongOperation('"--memory" and "-S" are the same configuration so only one of them is expected.')
            for key, value in config.items():
                if isinstance(value, bool):
                    if value is True:
                        extraConfig += f"{key} "
                else:
                    extraConfig += f"{key} {value} "

    assert isinstance(outFile, str), "<outFile> should be a string."
    if not outFile.rstrip().endswith(".arpa"):
        outFile += ".arpa"
    make_dependent_dirs(outFile, pathIsFile=True)

    words = tempfile.NamedTemporaryFile("w+", suffix=".txt", encoding="utf-8")

    try:
        if type_name(lexicons) == "LexiconBank":
            ws = lexicons("words")
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws.keys())
        elif isinstance(lexicons, str):
            if not os.path.isfile(lexicons):
                raise WrongPath(f"No such file: {lexicons}.")
            with open(lexicons, "r", encoding="utf-8") as fr:
                lines = fr.readlines()
            ws = []
            for line in lines:
                line = line.strip().split(maxsplit=1)
                if len(line) < 1:
                    continue
                else:
                    ws.append(line[0])
            words_count = math.ceil(len(ws) / 10) * 10
            ws = "\n".join(ws)
        else:
            raise UnsupportedType("<lexicons> should be a LexiconBank object or file path.")

        words.write(ws)
        words.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {textFile} --arpa {outFile} --limit_vocab_file {words.name}"
        out, err, cod = run_shell_command(cmd, stderr=subprocess.PIPE)

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            raise KenlmProcessError("Failed to generate the ARPA file.")
        else:
            return os.path.abspath(outFile)

    finally:
        words.close()
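# Hedged usage sketch for the path-based variants above, which take the corpus
# file name directly. "words.txt" and "corpus.txt" are hypothetical files; for
# this KenLM variant, <lexicons> may be either a words.txt path or a LexiconBank.
#
#   arpa = train_ngrams_kenlm("words.txt", order=3, textFile="corpus.txt",
#                             outFile="exp/lm/3gram.arpa", config={"-S": "40%"})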