def arpa_to_binary(arpaFile, outFile):
    '''
    Transform an ARPA language model into KenLM binary format.

    Args:
        <arpaFile>: ARPA file path.
        <outFile>: output binary file path.

    Return:
        the output file name with the suffix ".binary".
    '''
    declare.is_file("arpaFile", arpaFile)
    declare.is_valid_string("outFile", outFile)

    outFile = outFile.strip()
    if not outFile.endswith(".binary"):
        outFile += ".binary"
    declare.is_valid_file_name("outFile", outFile)
    make_dependent_dirs(outFile)

    cmd = os.path.join(sys.prefix, "exkaldisrc", "tools", "build_binary")
    cmd += f" -s {arpaFile} {outFile}"
    out, err, cod = run_shell_command(cmd, stderr="PIPE")

    if (cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
        print(err.decode())
        if os.path.isfile(outFile):
            os.remove(outFile)
        raise KenlmProcessError("Failed to transform ARPA to binary format.")
    else:
        return outFile
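# A minimal usage sketch (assumption: "lm/3grams.arpa" is an existing ARPA LM and the
# KenLM "build_binary" tool was compiled into <sys.prefix>/exkaldisrc/tools):
#
#     binFile = arpa_to_binary("lm/3grams.arpa", "lm/3grams")
#     # binFile == "lm/3grams.binary"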
def utt2spk_to_spk2utt(utt2spk, outFile=None):
    '''
    Transform an utt2spk table to a spk2utt table.

    Args:
        <utt2spk>: file name or exkaldi ListTable object.
        <outFile>: file name or None.

    Return:
        file name or exkaldi ListTable object.
    '''
    declare.is_potential_list_table("utt2spk", utt2spk)
    if outFile is not None:
        declare.is_valid_file_name(outFile)

    if isinstance(utt2spk, str):
        utt2spk = load_list_table(utt2spk)

    spk2utt = ListTable(name="spk2utt")
    for utt, spk in utt2spk.items():
        declare.is_valid_string("utterance ID", utt)
        declare.is_valid_string("speaker ID", spk)
        assert utt.count(" ") == 0, f"<utterance ID> is not a continuous string but contains spaces: {utt}."
        assert spk.count(" ") == 0, f"<speaker ID> is not a continuous string but contains spaces: {spk}."

        try:
            spk2utt[spk] += f" {utt}"
        except KeyError:
            spk2utt[spk] = utt

    if outFile is None:
        return spk2utt
    else:
        spk2utt.save(outFile)
        return outFile
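# A minimal usage sketch (the path "data/train/utt2spk" is only an illustration):
#
#     spk2utt = utt2spk_to_spk2utt("data/train/utt2spk")                  # returns an exkaldi ListTable
#     utt2spk_to_spk2utt("data/train/utt2spk", "data/train/spk2utt")      # writes the file and returns its name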
def save(self, fileName=None):
    '''
    Save the arguments to a file with the specified format.

    Args:
        _fileName_: None, or a reasonable file name.

    Return:
        a string of all contents if fileName is None, else the saved file name.
    '''
    if fileName is not None:
        declare.is_valid_file_name("fileName", fileName)
        make_dependent_dirs(fileName, True)

    contents = []
    contents.append(self.__discription)

    for name, info in self.__arguments.items():
        # option name
        m = "\n"
        m += f"name={name}\n"
        # option value
        if isinstance(info.value, (list, tuple)):
            value = "|".join(map(str, info.value))
        else:
            value = info.value
        m += f"value={value}\n"
        # abbreviation and dtype
        m += f"abbr={self.__name2Abb[name]}\n"
        m += f"dtype={info.dtype.__name__}\n"
        # default value
        if isinstance(info.default, (list, tuple)):
            default = "|".join(map(str, info.default))
        else:
            default = info.default
        m += f"default={default}\n"
        # choices
        if isinstance(info.choices, (list, tuple)):
            choices = "|".join(map(str, info.choices))
        else:
            choices = info.choices
        m += f"choices={choices}\n"
        # boundary and description
        m += f"minV={info.minV}\n"
        m += f"maxV={info.maxV}\n"
        m += f"discription={info.discription}"

        contents.append(m)

    contents = "\n".join(contents) + "\n"

    if fileName is not None:
        with open(fileName, "w", encoding="utf-8") as fw:
            fw.write(contents)
        return fileName
    else:
        return contents
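# A minimal usage sketch (assumption: "args" is an instance of the options class that
# owns this save() method, with some options already defined):
#
#     print(args.save())               # dump all options as a single string
#     args.save("conf/train.args")     # or write them to a file and get the file name back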
def spk2utt_to_utt2spk(spk2utt, outFile=None):
    '''
    Transform a spk2utt table to an utt2spk table.

    Args:
        <spk2utt>: file name or exkaldi ListTable object.
        <outFile>: file name or None.

    Return:
        file name or exkaldi ListTable object.
    '''
    declare.is_potential_list_table("spk2utt", spk2utt)
    if outFile is not None:
        declare.is_valid_file_name(outFile)

    if isinstance(spk2utt, str):
        spk2utt = load_list_table(spk2utt)

    utt2spk = ListTable(name="utt2spk")
    for spk, utts in spk2utt.items():
        declare.is_valid_string("utterance IDs", utts)
        declare.is_valid_string("speaker ID", spk)
        assert spk.count(" ") == 0, f"<speaker ID> is not a continuous string but contains spaces: {spk}."

        for utt in utts.split():
            try:
                utt2spk[utt]
            except KeyError:
                utt2spk[utt] = spk
            else:
                raise WrongDataFormat(f"utterance ID: {utt} is assigned to multiple speakers.")

    if outFile is None:
        return utt2spk
    else:
        utt2spk.save(outFile)
        return outFile
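# A minimal usage sketch (the path "data/train/spk2utt" is only an illustration):
#
#     utt2spk = spk2utt_to_utt2spk("data/train/spk2utt")                  # returns an exkaldi ListTable
#     spk2utt_to_utt2spk("data/train/spk2utt", "data/train/utt2spk")      # writes the file and returns its name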
def train_ngrams_srilm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the SRILM toolkit.
    If you don't specify a discounting method through the <config> option, "kndiscount" is used by default.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-Grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of the ARPA LM.
        <config>: extra configurations, a Python dict object.

    Return:
        the path of the generated ARPA file.

    You can use the .check_config("train_ngrams_srilm") function to get a reference of the extra configurations.
    You can also run the shell command "ngram-count" to see their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)
    # verify the max order
    declare.less_equal("order", order, "max order", 9)

    # prepare srilm tool
    ExkaldiInfo.prepare_srilm()

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that should be split by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The transcription doesn't seem to be separated by spaces or sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        unkSymbol = lexicons("oov")

        wordlistTemp = fhm.create("w+", encoding="utf-8", suffix=".txt")
        words = lexicons("words")
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        extraConfig = " "
        specifyDiscount = False
        if config is not None:
            if check_config(name="train_ngrams_srilm", config=config):
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                            if key.endswith("discount"):
                                specifyDiscount = True
                    else:
                        extraConfig += f" {key} {value}"

        cmd = f'ngram-count -text {text} -order {order} -limit-vocab -vocab {wordlistTemp.name} -unk -map-unk "{unkSymbol}" '
        if specifyDiscount is False:
            cmd += "-kndiscount "
        cmd += "-interpolate "
        # apply the extra configurations collected above (otherwise they would be silently ignored)
        cmd += extraConfig.strip() + " "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        cmd += f" -lm {outFile}"

        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or os.path.getsize(outFile) == 0:
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KaldiProcessError("Failed to generate N-Grams language model.")

        return outFile
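# A minimal usage sketch (assumption: "lexicons" is a prepared LexiconBank and
# "data/train/text" is a space-separated corpus; "-wbdiscount" is only an
# illustrative SRILM option and must be accepted by check_config):
#
#     arpaFile = train_ngrams_srilm(lexicons, 3, "data/train/text", "lm/3grams.arpa",
#                                   config={"-wbdiscount": True})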
def train_ngrams_kenlm(lexicons, order, text, outFile, config=None):
    '''
    Train an N-Grams language model with the KenLM toolkit.

    Args:
        <lexicons>: an exkaldi LexiconBank object.
        <order>: the maximum order of N-Grams.
        <text>: a text corpus file or an exkaldi transcription object.
        <outFile>: output file name of the ARPA LM.
        <config>: extra configurations, a Python dict object.

    Return:
        the path of the generated ARPA file.

    You can use the .check_config("train_ngrams_kenlm") function to get a reference of the extra configurations.
    You can also run the shell command "lmplz" to see their usage.
    '''
    declare.is_lexicon_bank("lexicons", lexicons)
    declare.is_positive_int("order", order)
    declare.is_potential_transcription("text", text)
    declare.is_valid_file_name("outFile", outFile)

    declare.less_equal("order", order, "max order", 9)

    with FileHandleManager() as fhm:
        # check whether this is a reasonable text corpus that should be split by spaces.
        if isinstance(text, str):
            cmd = f"shuf {text} -n 100"
            out, err, cod = run_shell_command(cmd, stdout="PIPE", stderr="PIPE")
            if (isinstance(cod, int) and cod != 0):
                print(err.decode())
                raise ShellProcessError(f"Failed to sample from text file: {text}.")
            elif out == b'':
                raise WrongDataFormat(f"Void text file: {text}.")
            else:
                out = out.decode().strip().split("\n")
                spaceCount = 0
                for line in out:
                    spaceCount += line.count(" ")
                if spaceCount < len(out) // 2:
                    raise WrongDataFormat("The text file doesn't seem to be separated by spaces or sentences are extremely short.")
        else:
            sampleText = text.subset(nRandom=100)
            spaceCount = 0
            for key, value in sampleText.items():
                assert isinstance(value, str), f"Transcription must be string but got: {type_name(value)}."
                spaceCount += value.count(" ")
            if spaceCount < len(sampleText) // 2:
                raise WrongDataFormat("The transcription doesn't seem to be separated by spaces or sentences are extremely short.")
            textTemp = fhm.create("a+", suffix=".txt", encoding="utf-8")
            text.save(textTemp, discardUttID=True)
            text = textTemp.name

        extraConfig = " "
        if config is not None:
            if check_config(name="train_ngrams_kenlm", config=config):
                if "--temp_prefix" in config.keys() and "-T" in config.keys():
                    raise WrongOperation('"--temp_prefix" and "-T" are the same configuration so only one of them is expected.')
                if "--memory" in config.keys() and "-S" in config.keys():
                    raise WrongOperation('"--memory" and "-S" are the same configuration so only one of them is expected.')
                for key, value in config.items():
                    if isinstance(value, bool):
                        if value is True:
                            extraConfig += f"{key} "
                    else:
                        extraConfig += f"{key} {value} "

        if not outFile.rstrip().endswith(".arpa"):
            outFile += ".arpa"
        make_dependent_dirs(outFile, pathIsFile=True)

        wordlistTemp = fhm.create("w+", encoding="utf-8", suffix=".txt")
        words = lexicons("words")
        words_count = math.ceil(len(words) / 10) * 10
        words = "\n".join(words.keys())
        wordlistTemp.write(words)
        wordlistTemp.seek(0)

        KenLMTool = os.path.join(sys.prefix, "exkaldisrc", "tools", "lmplz")

        cmd = f"{KenLMTool}{extraConfig}-o {order} --vocab_estimate {words_count} --text {text} --arpa {outFile} --limit_vocab_file {wordlistTemp.name}"
        out, err, cod = run_shell_command(cmd, stderr="PIPE")

        if (isinstance(cod, int) and cod != 0) or (not os.path.isfile(outFile)) or (os.path.getsize(outFile) == 0):
            print(err.decode())
            if os.path.isfile(outFile):
                os.remove(outFile)
            raise KenlmProcessError("Failed to generate arpa file.")

        return outFile
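# A minimal usage sketch (assumption: the compiled "lmplz" tool is available under
# <sys.prefix>/exkaldisrc/tools; "-S 20%" is only an illustrative lmplz memory limit):
#
#     arpaFile = train_ngrams_kenlm(lexicons, 3, "data/train/text", "lm/3grams.arpa",
#                                   config={"-S": "20%"})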
def check_multiple_resources(*resources, outFile=None):
    '''
    Check whether or not to use multiple processes and verify the resources.

    Args:
        <resources>: objects.
        <outFile>: None, a file name, or a list of None objects / file names.
                If None, it means the standard output stream.

    Return:
        lists of resources.
    '''
    # check the number of parallel processes
    multipleFlag = [ len(re) if isinstance(re, (list, tuple)) else 1 for re in resources ]
    multipleFlag = list(set(multipleFlag))

    if len(multipleFlag) == 0:
        raise WrongOperation("No resource has been received.")
    elif len(multipleFlag) > 2:
        raise WrongOperation(f"Resources have different sizes: {multipleFlag}. They should have the same amount unless their size is 1.")
    multipleFlag = max(multipleFlag)

    # check and modify the amount of each resource
    resources = list(resources)
    for index, target in enumerate(resources):
        if isinstance(target, (list, tuple)):
            if len(target) == 1:
                resources[index] = [ target[0] for i in range(multipleFlag) ]
            else:
                exType = None
                for t in target:
                    if exType is None:
                        exType = type_name(t)
                    elif type_name(t) != exType:
                        raise WrongDataFormat(f"Elements of one group should be the same data class, but got: {exType} != {type_name(t)}.")
        else:
            resources[index] = [ target for i in range(multipleFlag) ]

    # check the output file format
    if multipleFlag > 1:
        assert outFile is not None, "When applying parallel processes, an output file name is necessary."
        outFiles = []
        declare.is_classes("outFile", outFile, [str, list, tuple])
        if isinstance(outFile, str):
            declare.is_valid_file_name("outFile", outFile)
            outFile = os.path.abspath(outFile)
            dirName = os.path.dirname(outFile)
            fileName = os.path.basename(outFile)
            namePattern = f"nj%0{len(str(multipleFlag))}d_{fileName}"
            outFiles = [ os.path.join(dirName, namePattern % i) for i in range(multipleFlag) ]
        else:
            declare.equal("the number of output files", len(outFile), "the number of parallel processes", multipleFlag)
            outFiles = []
            for f in outFile:
                declare.is_valid_file_name("outFile", f)
                outFiles.append(f)
        resources.append(outFiles)
    else:
        if outFile is None:
            outFile = "-"
        else:
            declare.is_valid_file_name("outFile", outFile)
        resources.append([outFile, ])

    return resources
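# A minimal usage sketch (feat1 and feat2 stand for any two resource objects of the same class):
#
#     feats, outFiles = check_multiple_resources([feat1, feat2], outFile="mfcc.ark")
#     # feats    -> [feat1, feat2]
#     # outFiles -> absolute paths ending in "nj0_mfcc.ark" and "nj1_mfcc.ark"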