Example #1
0
def get_static_lexicon():
    """
    Build a lexicon that only contains the special TTS symbols

    Each of ``[space]``, ``[start]`` and ``[end]`` is registered as a lemma whose
    orthography and pronunciation are the symbol itself, and as a phoneme with
    variation "none".

    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    static_lexicon = lexicon.Lexicon()

    for symbol in ("[space]", "[start]", "[end]"):
        # the lemma is added first, directly followed by its phoneme
        static_lexicon.add_lemma(lexicon.Lemma(orth=[symbol], phon=[symbol]))
        static_lexicon.add_phoneme(symbol, variation="none")

    return static_lexicon
Example #2
0
def _get_special_lemma_lexicon(add_unknown_phoneme_and_mapping=True):
    """
    Create the lexicon holding the special lemmas for LibriSpeech

    The lexicon contains silence, sentence begin, sentence end and unknown,
    and no other special tokens.

    :param bool add_unknown_phoneme_and_mapping: if True, the [UNKNOWN] lemma gets the
        pronunciation [UNKNOWN] and [UNKNOWN] is added as phoneme; if False, only the
        lemma without pronunciation is added
    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    special_lexicon = lexicon.Lexicon()

    # the unknown lemma only carries a pronunciation when the phoneme is requested
    unknown_fields = {
        "orth": ["[UNKNOWN]"],
        "synt": ["<UNK>"],
        "special": "unknown",
    }
    if add_unknown_phoneme_and_mapping:
        unknown_fields["phon"] = ["[UNKNOWN]"]

    special_lemmas = (
        lexicon.Lemma(
            orth=["[SILENCE]", ""],
            phon=["[SILENCE]"],
            synt=[],
            special="silence",
            eval=[[]],
        ),
        lexicon.Lemma(orth=["[SENTENCE-BEGIN]"], synt=["<s>"], special="sentence-begin"),
        lexicon.Lemma(orth=["[SENTENCE-END]"], synt=["</s>"], special="sentence-end"),
        lexicon.Lemma(**unknown_fields),
    )
    for special_lemma in special_lemmas:
        special_lexicon.add_lemma(special_lemma)

    # phonemes are registered after all lemmas, silence first
    special_phonemes = ["[SILENCE]"]
    if add_unknown_phoneme_and_mapping:
        special_phonemes.append("[UNKNOWN]")
    for phoneme in special_phonemes:
        special_lexicon.add_phoneme(phoneme, variation="none")

    return special_lexicon
Example #3
0
    def run(self):
        """
        Convert the text lexicon into a bliss lexicon and write it as XML.

        Every non-blank line of the text file is split at the first run of
        whitespace into an orthography part and a pronunciation part. The
        orthography is the text before the first backslash of the first part.
        The pronunciation part is split at backslashes into variants, each
        variant being a whitespace separated phoneme sequence; empty variants
        are dropped.

        If an orthography occurs on several lines, the pronunciations are merged
        into the lemma created for its first occurrence, skipping pronunciations
        that lemma already has. All phonemes found are added to the lexicon in
        sorted order.

        Blank or whitespace-only lines are skipped. A line that only contains an
        orthography and no pronunciation part still raises an IndexError.
        """
        lex = lexicon.Lexicon()

        phonemes = set()
        seen_lemma = {}
        with uopen(self.text_file.get_path()) as f:
            for line in f:
                # splitting is taken from RASR
                # src/Tools/Bliss/blissLexiconLib.py#L185
                s = line.split(None, 1)
                if not s:
                    # blank line (e.g. an empty line at the end of the file),
                    # there is no entry to parse
                    continue
                orth = s[0].split("\\", 1)[0]
                phon_variants = [
                    tuple(p.split()) for p in s[1].split("\\") if p.strip()
                ]
                for phon_variant in phon_variants:
                    phonemes.update(phon_variant)
                # normalize each variant to single-space separated phonemes
                phon = [" ".join(v) for v in phon_variants]
                if orth in seen_lemma:
                    # repeated orthography: extend the existing lemma instead
                    # of adding a second lemma with the same orth
                    lemma = seen_lemma[orth]
                    for p in phon:
                        if p not in lemma.phon:
                            lemma.phon.append(p)
                else:
                    lemma = lexicon.Lemma(orth=[orth], phon=phon)
                    seen_lemma[orth] = lemma
                    lex.add_lemma(lemma)

        for phoneme in sorted(phonemes):
            lex.add_phoneme(phoneme)

        write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
Example #4
0
def get_static_lexicon():
    """
    Build a lexicon with the special symbols and the punctuation marks

    The special symbols ``[space]``, ``[start]`` and ``[end]`` are added as
    silence, sentence-begin and sentence-end lemmas; ``[space]`` additionally
    has the empty orthography. The punctuation marks ``. , ? ! - "`` are added
    as regular lemmas with a dedicated phoneme each. All phonemes use
    variation "none".

    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    static_lexicon = lexicon.Lexicon()

    # (orthographies, phoneme, special type)
    special_entries = (
        (["[space]", ""], "[space]", "silence"),
        (["[start]"], "[start]", "sentence-begin"),
        (["[end]"], "[end]", "sentence-end"),
    )
    for orths, phoneme, special_type in special_entries:
        static_lexicon.add_lemma(
            lexicon.Lemma(orth=orths, phon=[phoneme], special=special_type)
        )
        static_lexicon.add_phoneme(phoneme, variation="none")

    # (punctuation mark, phoneme)
    punctuation_entries = (
        (".", "[dot]"),
        (",", "[comma]"),
        ("?", "[question_mark]"),
        ("!", "[exclamation_mark]"),
        ("-", "[hyphen]"),
        ('"', "[quotation]"),
    )
    for mark, phoneme in punctuation_entries:
        static_lexicon.add_lemma(lexicon.Lemma(orth=[mark], phon=[phoneme]))
        static_lexicon.add_phoneme(phoneme, variation="none")

    return static_lexicon
Example #5
0
    def run(self):
        """
        Build a graphemic bliss lexicon from the word list and write it as XML.

        Every stripped line of the word list file is taken as one word. The
        characters of all words (except the space) form the phoneme inventory,
        each mapped through ``self.transforms`` if it has an entry there. The
        pronunciation of a word is the space separated sequence of its
        (transformed) characters, with one leading and one trailing space.

        Besides the words, the lexicon gets a silence lemma and sentence border
        lemmata, plus an unknown lemma if ``self.add_unknown`` is set and a
        noise lemma if ``self.add_noise`` is set.
        """
        with uopen(tk.uncached_path(self.word_list_file), "rt") as word_list:
            words = [line.strip() for line in word_list]

        # every character of every word is a grapheme of the inventory
        graphemes = {char for word in words for char in word}
        graphemes.discard(" ")  # just in case

        lex = lexicon.Lexicon()

        # phoneme inventory: silence, graphemes (sorted before transformation),
        # then the optional unknown and noise phonemes
        lex.add_phoneme("sil", variation="none")
        for grapheme in sorted(graphemes):
            lex.add_phoneme(self.transforms.get(grapheme, grapheme), "context")
        if self.add_unknown:
            lex.add_phoneme("unk", "none")
        if self.add_noise:
            lex.add_phoneme("noise", "none")

        # TODO: figure out requirements on synt/eval element for different types of lemmata
        # silence lemma, needs synt/eval element with empty token sequence
        silence_lemma = lexicon.Lemma(
            orth=["[SILENCE]", ""],
            phon=["sil"],
            synt=[],
            special="silence",
            eval=[[]],
        )
        lex.add_lemma(silence_lemma)

        # sentence border lemmata, needs no eval element
        sentence_begin_lemma = lexicon.Lemma(
            orth=["[SENTENCE_BEGIN]"], synt=["<s>"], special="sentence-begin"
        )
        lex.add_lemma(sentence_begin_lemma)
        sentence_end_lemma = lexicon.Lemma(
            orth=["[SENTENCE_END]"], synt=["</s>"], special="sentence-end"
        )
        lex.add_lemma(sentence_end_lemma)

        # unknown lemma, needs no synt/eval element
        if self.add_unknown:
            unknown_lemma = lexicon.Lemma(
                orth=["[UNKNOWN]"], phon=["unk"], special="unknown"
            )
            lex.add_lemma(unknown_lemma)
            # TODO: synt = ["<UNK>"] ???

        # noise lemma, needs empty synt token sequence but no eval element?
        # NOTE(review): the noise lemma is also marked special="unknown" - confirm this is intended
        if self.add_noise:
            noise_lemma = lexicon.Lemma(
                orth=["[NOISE]"],
                phon=["noise"],
                synt=[],
                special="unknown",
            )
            lex.add_lemma(noise_lemma)

        # one lemma per word, pronounced as its sequence of (transformed) characters
        for word in words:
            pronunciation = " ".join(self.transforms.get(char, char) for char in word)
            word_lemma = lexicon.Lemma()
            word_lemma.orth.append(word)
            word_lemma.phon.append(" " + pronunciation + " ")
            lex.add_lemma(word_lemma)

        with uopen(self.out_bliss_lexicon.get_path(), "w") as lexicon_file:
            lexicon_file.write('<?xml version="1.0" encoding="utf-8"?>\n')
            lexicon_file.write(ET.tostring(lex.to_xml(), "unicode"))
Example #6
0
def get_special_lemma_lexicon():
    """
    Create the lexicon with the special phonemes/lemmas for Switchboard

    The phonemes for silence, noise, vocalized noise and laughter are added with
    variation "none". The three noise events are added as non-special lemmas,
    followed by the special lemmas for sentence boundary, sentence begin,
    sentence end, silence and unknown.

    :return: the lexicon with the special phonemes and lemmas
    :rtype: lexicon.Lexicon
    """
    lex = lexicon.Lexicon()

    silence_symbol = "[SILENCE]"
    # (orthography tag, phoneme symbol) of the noise events
    noise_events = [
        ("[NOISE]", "[NOISE]"),
        ("[VOCALIZED-NOISE]", "[VOCALIZEDNOISE]"),
        ("[LAUGHTER]", "[LAUGHTER]"),
    ]

    # all phonemes are added before any lemma, silence first
    lex.add_phoneme(silence_symbol, variation="none")
    for _, noise_phoneme in noise_events:
        lex.add_phoneme(noise_phoneme, variation="none")

    # non-special lemmas, the silence lemma is part of the special lemmas below
    for noise_tag, noise_phoneme in noise_events:
        lex.add_lemma(
            lexicon.Lemma(
                orth=[noise_tag],
                phon=[noise_phoneme],
                synt=[],
                eval=[[]],
            )
        )

    special_lemmas = [
        lexicon.Lemma(
            orth=["[SENTENCE-END]"],
            synt=["</s>"],
            special="sentence-boundary",
        ),
        lexicon.Lemma(
            orth=["[sentence-begin]"],
            synt=["<s>"],
            eval=[[]],
            special="sentence-begin",
        ),
        lexicon.Lemma(
            orth=["[sentence-end]"],
            synt=["</s>"],
            eval=[[]],
            special="sentence-end",
        ),
        lexicon.Lemma(
            orth=[silence_symbol, ""],
            phon=[silence_symbol],
            synt=[],
            eval=[[]],
            special="silence",
        ),
        lexicon.Lemma(
            orth=["[UNKNOWN]"],
            synt=["<unk>"],
            eval=[[]],
            special="unknown",
        ),
    ]
    for special_lemma in special_lemmas:
        lex.add_lemma(special_lemma)

    return lex