def get_static_lexicon():
    """
    Build the lexicon containing only the special TTS symbols
    ([space], [start], [end]), each registered both as a lemma and as its
    own single phoneme.

    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    # Fix: the old docstring documented a nonexistent ``include_punctuation``
    # parameter; this function takes no arguments.
    lex = lexicon.Lexicon()
    # each symbol maps onto a dedicated phoneme of the same name
    for symbol in ["[space]", "[start]", "[end]"]:
        lex.add_lemma(lexicon.Lemma(orth=[symbol], phon=[symbol]))
        lex.add_phoneme(symbol, variation="none")
    return lex
def _get_special_lemma_lexicon(add_unknown_phoneme_and_mapping=True):
    """
    Generate the special lemmas for LibriSpeech

    Librispeech uses silence, sentence begin/end and unknown, but no other
    special tokens.

    :param bool add_unknown_phoneme_and_mapping: add [UNKNOWN] as phoneme,
        otherwise add only the lemma without it
    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    lex = lexicon.Lexicon()

    # silence: empty orth variant, empty synt token sequence, empty eval
    silence_lemma = lexicon.Lemma(
        orth=["[SILENCE]", ""],
        phon=["[SILENCE]"],
        synt=[],
        special="silence",
        eval=[[]],
    )
    lex.add_lemma(silence_lemma)

    # sentence boundary lemmas carry only a synt token, no phoneme
    lex.add_lemma(
        lexicon.Lemma(orth=["[SENTENCE-BEGIN]"], synt=["<s>"], special="sentence-begin")
    )
    lex.add_lemma(
        lexicon.Lemma(orth=["[SENTENCE-END]"], synt=["</s>"], special="sentence-end")
    )

    # unknown lemma: phoneme mapping is optional
    unknown_kwargs = dict(orth=["[UNKNOWN]"], synt=["<UNK>"], special="unknown")
    if add_unknown_phoneme_and_mapping:
        unknown_kwargs["phon"] = ["[UNKNOWN]"]
    lex.add_lemma(lexicon.Lemma(**unknown_kwargs))

    # phoneme inventory mirrors the lemmas above
    lex.add_phoneme("[SILENCE]", variation="none")
    if add_unknown_phoneme_and_mapping:
        lex.add_phoneme("[UNKNOWN]", variation="none")
    return lex
def run(self):
    """Parse the plain-text lexicon file and write it out as Bliss XML."""
    lex = lexicon.Lexicon()
    phoneme_inventory = set()
    lemma_by_orth = {}

    with uopen(self.text_file.get_path()) as infile:
        for raw_line in infile:
            # splitting is taken from RASR
            # src/Tools/Bliss/blissLexiconLib.py#L185
            fields = raw_line.split(None, 1)
            orth = fields[0].split("\\", 1)[0]
            variants = [
                tuple(v.split()) for v in fields[1].split("\\") if v.strip()
            ]
            for variant in variants:
                phoneme_inventory.update(variant)
            pronunciations = [" ".join(variant) for variant in variants]

            if orth not in lemma_by_orth:
                new_lemma = lexicon.Lemma(orth=[orth], phon=pronunciations)
                lemma_by_orth[orth] = new_lemma
                lex.add_lemma(new_lemma)
            else:
                # duplicate orth: merge any unseen pronunciations into
                # the lemma that was created for the first occurrence
                existing = lemma_by_orth[orth]
                for pron in pronunciations:
                    if pron not in existing.phon:
                        existing.phon.append(pron)

    for phoneme in sorted(phoneme_inventory):
        lex.add_phoneme(phoneme)

    write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
def get_static_lexicon():
    """
    Build the lexicon with the special lemmas ([space]/[start]/[end],
    marked as silence / sentence-begin / sentence-end) plus one lemma per
    punctuation mark, each mapped onto its own dedicated phoneme.

    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    # Fix: the old docstring documented a nonexistent ``include_punctuation``
    # parameter; this function takes no arguments.
    lex = lexicon.Lexicon()

    # special lemmas; [space] also matches the empty orth like a silence lemma
    lex.add_lemma(
        lexicon.Lemma(orth=["[space]", ""], phon=["[space]"], special="silence")
    )
    lex.add_phoneme("[space]", variation="none")
    lex.add_lemma(
        lexicon.Lemma(orth=["[start]"], phon=["[start]"], special="sentence-begin")
    )
    lex.add_phoneme("[start]", variation="none")
    lex.add_lemma(
        lexicon.Lemma(orth=["[end]"], phon=["[end]"], special="sentence-end")
    )
    lex.add_phoneme("[end]", variation="none")

    # punctuation lemmas, in the same registration order as before
    punctuation = [
        (".", "[dot]"),
        (",", "[comma]"),
        ("?", "[question_mark]"),
        ("!", "[exclamation_mark]"),
        ("-", "[hyphen]"),
        ('"', "[quotation]"),
    ]
    for orth, phon in punctuation:
        lex.add_lemma(lexicon.Lemma(orth=[orth], phon=[phon]))
        lex.add_phoneme(phon, variation="none")

    return lex
def run(self):
    """Build a character-level lexicon from a word list and write Bliss XML."""
    with uopen(tk.uncached_path(self.word_list_file), "rt") as f:
        words = [line.strip() for line in f]

    # collect the character inventory of all words
    char_inventory = set()
    for word in words:
        char_inventory.update(word)
    char_inventory.discard(" ")  # just in case

    lex = lexicon.Lexicon()
    lex.add_phoneme("sil", variation="none")
    for ch in sorted(char_inventory):
        lex.add_phoneme(self.transforms.get(ch, ch), "context")
    if self.add_unknown:
        lex.add_phoneme("unk", "none")
    if self.add_noise:
        lex.add_phoneme("noise", "none")

    # TODO: figure out requirements on synt/eval element for differnt types of lemmata
    # silence lemma, needs synt/eval element with empty token sequence
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SILENCE]", ""],
            phon=["sil"],
            synt=[],
            special="silence",
            eval=[[]],
        )
    )
    # sentence border lemmata, needs no eval element
    lex.add_lemma(
        lexicon.Lemma(orth=["[SENTENCE_BEGIN]"], synt=["<s>"], special="sentence-begin")
    )
    lex.add_lemma(
        lexicon.Lemma(orth=["[SENTENCE_END]"], synt=["</s>"], special="sentence-end")
    )
    # unknown lemma, needs no synt/eval element
    if self.add_unknown:
        lex.add_lemma(
            lexicon.Lemma(orth=["[UNKNOWN]"], phon=["unk"], special="unknown")
        )
        # TODO: synt = ["<UNK>"] ???
    # noise lemma, needs empty synt token sequence but no eval element?
    if self.add_noise:
        lex.add_lemma(
            lexicon.Lemma(
                orth=["[NOISE]"],
                phon=["noise"],
                synt=[],
                special="unknown",
            )
        )

    # one lemma per word; the pronunciation is its (transformed) characters,
    # padded with a leading/trailing space as in the original format
    for word in words:
        lemma = lexicon.Lemma()
        lemma.orth.append(word)
        lemma.phon.append(
            " " + " ".join(self.transforms.get(c, c) for c in word) + " "
        )
        lex.add_lemma(lemma)

    with uopen(self.out_bliss_lexicon.get_path(), "w") as lexicon_file:
        lexicon_file.write('<?xml version="1.0" encoding="utf-8"?>\n')
        lexicon_file.write(ET.tostring(lex.to_xml(), "unicode"))
def get_special_lemma_lexicon():
    """
    Generate the special phonemes/lemmas for Switchboard

    :rtype: lexicon.Lexicon
    """
    lex = lexicon.Lexicon()

    special_tags = ["[SILENCE]", "[NOISE]", "[VOCALIZED-NOISE]", "[LAUGHTER]"]
    # orth tag -> phoneme symbol (note: vocalized noise loses the hyphen)
    phon_of = {
        "[SILENCE]": "[SILENCE]",
        "[NOISE]": "[NOISE]",
        "[VOCALIZED-NOISE]": "[VOCALIZEDNOISE]",
        "[LAUGHTER]": "[LAUGHTER]",
    }
    for tag in special_tags:
        lex.add_phoneme(phon_of[tag], variation="none")

    # non-special lemmas; silence itself is handled further below
    for tag in special_tags[1:]:
        lex.add_lemma(
            lexicon.Lemma(orth=[tag], phon=[phon_of[tag]], synt=[], eval=[[]])
        )

    # special lemmas
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SENTENCE-END]"], synt=["</s>"], special="sentence-boundary"
        )
    )
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[sentence-begin]"],
            synt=["<s>"],
            eval=[[]],
            special="sentence-begin",
        )
    )
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[sentence-end]"],
            synt=["</s>"],
            eval=[[]],
            special="sentence-end",
        )
    )
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SILENCE]", ""],
            phon=["[SILENCE]"],
            synt=[],
            eval=[[]],
            special="silence",
        )
    )
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[UNKNOWN]"], synt=["<unk>"], eval=[[]], special="unknown"
        )
    )
    return lex