Esempio n. 1
0
    def run(self):
        """
        Replace the orthography of every corpus segment by its phoneme
        representation, looked up word-by-word in the bliss lexicon, with a
        configurable separator symbol between words.

        Raises LookupError if a word of any segment is not in the lexicon.
        """
        c = corpus.Corpus()
        c.load(self.bliss_corpus.get_path())

        lex = lexicon.Lexicon()
        lex.load(self.bliss_lexicon.get_path())

        # build lookup dict: orthography -> pronunciation
        lookup_dict = {}
        for lemma in lex.lemmata:
            for orth in lemma.orth:
                # PICK_FIRST: take the first pronunciation variant of the lemma
                if orth and self.strategy == LexiconStrategy.PICK_FIRST:
                    if len(lemma.phon) > 0:
                        lookup_dict[orth] = lemma.phon[0]

        word_separation_phon = lookup_dict[self.word_separation_orth]
        print("using word separation symbol: %s" % word_separation_phon)
        separator = " %s " % word_separation_phon

        for segment in c.segments():
            try:
                # keep the try body minimal: only the lookup can raise
                words = [lookup_dict[w] for w in segment.orth.split(" ")]
            except KeyError as e:
                # chain the original KeyError so the offending word is visible;
                # KeyError is a LookupError, so callers catching LookupError
                # are unaffected
                raise LookupError(
                    "Out-of-vocabulary word %s detected, please make sure that there are no OOVs remaining by e.g. applying G2P"
                    % e
                ) from e
            segment.orth = separator.join(words)

        c.dump(self.out_corpus.get_path())
Esempio n. 2
0
    def run(self):
        """
        Materialize the statically defined lexicon into a bliss XML file,
        optionally sorting phonemes and lemmata first.
        """
        lex = lexicon.Lexicon()

        if self.sort_phonemes:
            # insert phonemes ordered alphabetically by symbol
            for symbol in sorted(self.static_lexicon.phonemes.keys()):
                lex.add_phoneme(
                    symbol=symbol, variation=self.static_lexicon.phonemes[symbol]
                )
        else:
            lex.phonemes = self.static_lexicon.phonemes

        if self.sort_lemmata:
            # order lemmata by their first orth entry
            by_first_orth = {
                lemma.orth[0]: lemma for lemma in self.static_lexicon.lemmata
            }
            lex.lemmata = [by_first_orth[key] for key in sorted(by_first_orth)]
        else:
            lex.lemmata = self.static_lexicon.lemmata

        write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
Esempio n. 3
0
    def run(self):
        """
        Build a bliss lexicon from a plain word-pronunciation text file.

        Each line carries an orthography followed by pronunciation variants
        separated by backslashes; duplicate orthographies are merged into a
        single lemma with the union of their pronunciations.
        """
        lex = lexicon.Lexicon()

        all_phonemes = set()
        lemma_by_orth = {}
        with uopen(self.text_file.get_path()) as infile:
            for raw_line in infile:
                # splitting is taken from RASR
                # src/Tools/Bliss/blissLexiconLib.py#L185
                fields = raw_line.split(None, 1)
                word = fields[0].split("\\", 1)[0]
                variants = [
                    tuple(part.split())
                    for part in fields[1].split("\\")
                    if part.strip()
                ]
                for variant in variants:
                    all_phonemes.update(variant)
                pronunciations = [" ".join(variant) for variant in variants]
                if word in lemma_by_orth:
                    # orthography seen before: extend its pronunciation list
                    existing = lemma_by_orth[word]
                    for pron in pronunciations:
                        if pron not in existing.phon:
                            existing.phon.append(pron)
                else:
                    new_lemma = lexicon.Lemma(orth=[word], phon=pronunciations)
                    lemma_by_orth[word] = new_lemma
                    lex.add_lemma(new_lemma)

        for symbol in sorted(all_phonemes):
            lex.add_phoneme(symbol)

        write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
Esempio n. 4
0
    def run(self):
        """
        Merge lemmata that share the same first orthography into a single
        lemma, combining orth/phon/synt/eval entries without duplicates.
        Special lemmata and empty-orth lemmata are left untouched.
        """
        lex = lexicon.Lexicon()
        lex.load(self.bliss_lexicon.get_path())

        # group mergeable lemmata by their first orth entry
        orth2lemmata = collections.defaultdict(list)

        for lemma in lex.lemmata:
            if lemma.special:
                continue
            num_orths = len(lemma.orth)
            if num_orths < 1:
                continue
            if num_orths > 1 and not self.merge_multi_orths_lemmata:
                continue
            orth2lemmata[lemma.orth[0]].append(lemma)

        for orth, lemmata in orth2lemmata.items():
            if len(lemmata) < 2:
                continue
            # fold all later lemmata into the first one and drop them
            final_lemma = lemmata[0]
            for lemma in lemmata[1:]:
                # `other_orth` instead of `orth` — the original shadowed the
                # grouping key of the outer loop
                for other_orth in lemma.orth:
                    if other_orth not in final_lemma.orth:
                        final_lemma.orth.append(other_orth)
                for phon in lemma.phon:
                    if phon not in final_lemma.phon:
                        final_lemma.phon.append(phon)
                if final_lemma.synt is None and lemma.synt is not None:
                    final_lemma.synt = lemma.synt
                # `eval_entry` avoids shadowing the builtin `eval`
                for eval_entry in lemma.eval:
                    if eval_entry not in final_lemma.eval:
                        final_lemma.eval.append(eval_entry)
                lex.lemmata.remove(lemma)

        write_xml(self.out_bliss_lexicon, element_tree=lex.to_xml())
Esempio n. 5
0
def get_static_lexicon():
    """
    Build a lexicon containing the special TTS symbols.

    Each symbol gets a lemma whose orth and phon are identical, plus a
    matching phoneme without variation.

    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    lex = lexicon.Lexicon()

    # same lemma+phoneme pattern for every special symbol, in fixed order
    for symbol in ["[space]", "[start]", "[end]"]:
        lex.add_lemma(lexicon.Lemma(orth=[symbol], phon=[symbol]))
        lex.add_phoneme(symbol, variation="none")

    return lex
Esempio n. 6
0
    def run(self):
        """
        Merge multiple bliss lexica into a single lexicon.

        Phonemes are deduplicated (conflicting variations are an error);
        lemmata are concatenated, optionally sorted by first orth entry.
        """
        merged_lex = lexicon.Lexicon()

        loaded = []
        for lexicon_path in self.lexica:
            single = lexicon.Lexicon()
            single.load(lexicon_path.get_path())
            loaded.append(single)

        # collect phonemes; a symbol appearing twice must agree on variation
        merged_phonemes = OrderedDict()
        for single in loaded:
            for symbol, variation in single.phonemes.items():
                if symbol not in merged_phonemes:
                    merged_phonemes[symbol] = variation
                else:
                    assert merged_phonemes[symbol] == variation, (
                        "conflicting phoneme variant for phoneme: %s" % symbol)

        if self.sort_phonemes:
            for symbol in sorted(merged_phonemes.keys()):
                merged_lex.add_phoneme(
                    symbol=symbol, variation=merged_phonemes[symbol]
                )
        else:
            merged_lex.phonemes = merged_phonemes

        # combine the lemmata
        if self.sort_lemmata:
            lemma_dict = defaultdict(list)
            for single in loaded:
                for lemma in single.lemmata:
                    # group by first orth entry (empty string if no orth)
                    key = lemma.orth[0] if lemma.orth else ""
                    lemma_dict[key].append(lemma)
            merged_lex.lemmata = list(
                itertools.chain.from_iterable(
                    lemma_dict[key] for key in sorted(lemma_dict.keys())
                )
            )
        else:
            # NOTE(review): lemmata are concatenated as-is; duplicate orths
            # across lexica are not deduplicated here
            for single in loaded:
                merged_lex.lemmata.extend(single.lemmata)

        write_xml(self.out_bliss_lexicon.get_path(), merged_lex.to_xml())
Esempio n. 7
0
    def run(self):
        """
        Build a phoneme -> index vocabulary from the lexicon, pickle it to
        ``out_vocab``, and publish the vocabulary size.
        """
        lex = lexicon.Lexicon()
        lex.load(self.bliss_lexicon.get_path())

        # enumerate phonemes in lexicon order: symbol -> index
        vocab = {k: v for v, k in enumerate(lex.phonemes.keys())}
        # context manager ensures the handle is flushed and closed — the
        # original leaked the file object returned by uopen()
        with uopen(self.out_vocab, "wb") as f:
            pickle.dump(vocab, f)

        print("Vocab Size: %i" % len(lex.phonemes))
        self.out_vocab_size.set(len(lex.phonemes))
Esempio n. 8
0
def _get_special_lemma_lexicon(add_unknown_phoneme_and_mapping=True):
    """
    Generate the special lemmas for LibriSpeech.

    Librispeech uses silence, sentence begin/end and unknown, but no other special tokens.

    :param bool add_unknown_phoneme_and_mapping: add [UNKNOWN] as phoneme, otherwise add only the lemma without it
    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    lex = lexicon.Lexicon()

    # silence also carries an empty orth and an empty eval token sequence
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SILENCE]", ""],
            phon=["[SILENCE]"],
            synt=[],
            special="silence",
            eval=[[]],
        )
    )
    # sentence boundary lemmata only carry a syntactic token
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SENTENCE-BEGIN]"], synt=["<s>"], special="sentence-begin"
        )
    )
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SENTENCE-END]"], synt=["</s>"], special="sentence-end"
        )
    )

    # the unknown lemma gets a pronunciation only when the [UNKNOWN]
    # phoneme is added as well
    if add_unknown_phoneme_and_mapping:
        unknown_lemma = lexicon.Lemma(
            orth=["[UNKNOWN]"],
            phon=["[UNKNOWN]"],
            synt=["<UNK>"],
            special="unknown",
        )
    else:
        unknown_lemma = lexicon.Lemma(
            orth=["[UNKNOWN]"],
            synt=["<UNK>"],
            special="unknown",
        )
    lex.add_lemma(unknown_lemma)

    lex.add_phoneme("[SILENCE]", variation="none")
    if add_unknown_phoneme_and_mapping:
        lex.add_phoneme("[UNKNOWN]", variation="none")
    return lex
Esempio n. 9
0
    def _fix_hash_for_lexicon(cls, new_lexicon):
        """
        The "old" lexicon had an incorrect "synt" type; after fixing,
        the hashes for the lexicon changed, so this job here needs to
        revert the lexicon to the old "synt" type.

        :param lexicon.Lexicon new_lexicon:
        :return: lexicon in the legacy format
        :rtype: lexicon.Lexicon
        """
        legacy_lex = lexicon.Lexicon()
        legacy_lex.phonemes = new_lexicon.phonemes
        legacy_lex.lemmata = []
        for source_lemma in new_lexicon.lemmata:
            converted = copy.deepcopy(source_lemma)
            # legacy format wraps synt in a list; None becomes the empty list
            if source_lemma.synt is None:
                converted.synt = []
            else:
                converted.synt = [source_lemma.synt]
            legacy_lex.lemmata.append(converted)

        return legacy_lex
Esempio n. 10
0
def get_static_lexicon():
    """
    Build a lexicon with the special TTS lemmas and punctuation entries.

    Silence and sentence boundary symbols are added as special lemmata;
    each punctuation mark is mapped to its own pseudo phoneme.

    :return: the lexicon with special lemmas and phonemes
    :rtype: lexicon.Lexicon
    """
    lex = lexicon.Lexicon()

    # special symbols: silence and sentence boundaries
    lex.add_lemma(
        lexicon.Lemma(orth=["[space]", ""],
                      phon=["[space]"],
                      special="silence"))
    lex.add_phoneme("[space]", variation="none")

    lex.add_lemma(
        lexicon.Lemma(orth=["[start]"],
                      phon=["[start]"],
                      special="sentence-begin"))
    lex.add_phoneme("[start]", variation="none")

    lex.add_lemma(
        lexicon.Lemma(orth=["[end]"], phon=["[end]"], special="sentence-end"))
    lex.add_phoneme("[end]", variation="none")

    # punctuation marks, each with a dedicated pseudo phoneme
    punctuation = [
        (".", "[dot]"),
        (",", "[comma]"),
        ("?", "[question_mark]"),
        ("!", "[exclamation_mark]"),
        ("-", "[hyphen]"),
        ('"', "[quotation]"),
    ]
    for orth, phon in punctuation:
        lex.add_lemma(lexicon.Lemma(orth=[orth], phon=[phon]))
        lex.add_phoneme(phon, variation="none")

    return lex
Esempio n. 11
0
    def run(self):
        """
        Create a character-based bliss lexicon from a word list: every
        character of a word becomes a phoneme (subject to ``self.transforms``),
        plus the usual special lemmata.
        """
        with uopen(tk.uncached_path(self.word_list_file), "rt") as infile:
            words = [line.strip() for line in infile]

        # collect the character inventory over all words
        char_inventory = set()
        for word in words:
            char_inventory.update(word)
        char_inventory.discard(" ")  # just in case

        lex = lexicon.Lexicon()
        lex.add_phoneme("sil", variation="none")
        for char in sorted(char_inventory):
            lex.add_phoneme(self.transforms.get(char, char), "context")
        if self.add_unknown:
            lex.add_phoneme("unk", "none")
        if self.add_noise:
            lex.add_phoneme("noise", "none")

        # TODO: figure out requirements on synt/eval element for different types of lemmata
        # silence lemma, needs synt/eval element with empty token sequence
        lex.add_lemma(
            lexicon.Lemma(
                orth=["[SILENCE]", ""],
                phon=["sil"],
                synt=[],
                special="silence",
                eval=[[]],
            )
        )
        # sentence border lemmata, need no eval element
        lex.add_lemma(
            lexicon.Lemma(
                orth=["[SENTENCE_BEGIN]"], synt=["<s>"], special="sentence-begin"
            )
        )
        lex.add_lemma(
            lexicon.Lemma(
                orth=["[SENTENCE_END]"], synt=["</s>"], special="sentence-end"
            )
        )
        # unknown lemma, needs no synt/eval element
        if self.add_unknown:
            lex.add_lemma(
                lexicon.Lemma(orth=["[UNKNOWN]"], phon=["unk"], special="unknown")
            )
            # TODO: synt = ["<UNK>"] ???
        # noise lemma, needs empty synt token sequence but no eval element?
        # NOTE(review): special="unknown" on the [NOISE] lemma looks like a
        # copy-paste from the unknown lemma above — confirm intended value
        if self.add_noise:
            lex.add_lemma(
                lexicon.Lemma(
                    orth=["[NOISE]"],
                    phon=["noise"],
                    synt=[],
                    special="unknown",
                )
            )

        # one lemma per word; phon is the space-padded, space-joined
        # sequence of (transformed) characters
        for word in words:
            entry = lexicon.Lemma()
            entry.orth.append(word)
            entry.phon.append(
                " " + " ".join(self.transforms.get(c, c) for c in word) + " "
            )
            lex.add_lemma(entry)

        with uopen(self.out_bliss_lexicon.get_path(), "w") as lexicon_file:
            lexicon_file.write('<?xml version="1.0" encoding="utf-8"?>\n')
            lexicon_file.write(ET.tostring(lex.to_xml(), "unicode"))
Esempio n. 12
0
def get_special_lemma_lexicon():
    """
    Generate the special phonemes/lemmas for Switchboard.

    :rtype: lexicon.Lexicon
    """
    lex = lexicon.Lexicon()

    # noise-like tags and their (partly normalized) phoneme symbols
    tags = ["[SILENCE]", "[NOISE]", "[VOCALIZED-NOISE]", "[LAUGHTER]"]
    tag_to_phon = {
        "[SILENCE]": "[SILENCE]",
        "[NOISE]": "[NOISE]",
        "[VOCALIZED-NOISE]": "[VOCALIZEDNOISE]",
        "[LAUGHTER]": "[LAUGHTER]",
    }
    for tag in tags:
        lex.add_phoneme(tag_to_phon[tag], variation="none")

    # non-special lemmata for the noise tags (silence is handled below)
    for tag in tags[1:]:
        lex.add_lemma(
            lexicon.Lemma(
                orth=[tag],
                phon=[tag_to_phon[tag]],
                synt=[],
                eval=[[]],
            ))

    # special lemmata
    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SENTENCE-END]"],
            synt=["</s>"],
            special="sentence-boundary",
        ))

    lex.add_lemma(
        lexicon.Lemma(
            orth=["[sentence-begin]"],
            synt=["<s>"],
            eval=[[]],
            special="sentence-begin",
        ))

    lex.add_lemma(
        lexicon.Lemma(
            orth=["[sentence-end]"],
            synt=["</s>"],
            eval=[[]],
            special="sentence-end",
        ))

    lex.add_lemma(
        lexicon.Lemma(
            orth=["[SILENCE]", ""],
            phon=["[SILENCE]"],
            synt=[],
            eval=[[]],
            special="silence",
        ))

    lex.add_lemma(
        lexicon.Lemma(
            orth=["[UNKNOWN]"],
            synt=["<unk>"],
            eval=[[]],
            special="unknown",
        ))

    return lex