コード例 #1
0
    def run(self):
        """
        Write a copy of the input lexicon with the silence phoneme listed first.

        Loads the lexicon from ``self.bliss_lexicon``, looks for lemmata whose
        ``special`` attribute is ``"silence"`` and takes their single ``phon``
        entry as the silence phoneme. If ``self.delete_empty_orth`` is set,
        empty orthographies are removed from those lemmata. The output lexicon
        gets the silence phoneme added first, followed by all remaining
        phonemes in the iteration order of the input phoneme table, and is
        written to ``self.out_lexicon``.

        :raises AssertionError: if no silence lemma is found, or if a silence
            lemma does not have exactly one entry in ``phon``
        """
        in_lexicon = Lexicon()
        in_lexicon.load(self.bliss_lexicon.get_path())

        out_lexicon = Lexicon()
        # the lemma list is shared with the input lexicon; lemmata are edited in place
        out_lexicon.lemmata = in_lexicon.lemmata

        silence_phon = None
        for lemma in out_lexicon.lemmata:
            if lemma.special != "silence":
                continue
            if self.delete_empty_orth:
                lemma.orth = [orth for orth in lemma.orth if orth != ""]
            # validate before indexing, so that a silence lemma without any
            # pronunciation fails with this message instead of an IndexError
            assert len(lemma.phon) == 1, (
                "Silence lemma does not have only one phoneme"
            )
            # if several silence lemmata exist, the last one determines the phoneme
            silence_phon = lemma.phon[0]
        assert silence_phon, (
            "No silence lemma found"
        )

        # silence goes first ...
        out_lexicon.add_phoneme(silence_phon, in_lexicon.phonemes[silence_phon])

        # ... followed by everything else in the original order
        for phoneme, variation in in_lexicon.phonemes.items():
            if phoneme == silence_phon:
                continue
            out_lexicon.add_phoneme(phoneme, variation)

        write_xml(self.out_lexicon.get_path(), out_lexicon.to_xml())
コード例 #2
0
    def run(self):
        """
        Write ``self.static_lexicon`` to ``self.out_bliss_lexicon`` as XML.

        If ``self.sort_phonemes`` is set, the phonemes are added to the output
        in sorted symbol order; otherwise the phoneme table of the static
        lexicon is reused as-is. If ``self.sort_lemmata`` is set, the lemmata
        are ordered by their first orthography; otherwise the lemma list of
        the static lexicon is reused as-is.
        """
        lex = lexicon.Lexicon()
        static_phonemes = self.static_lexicon.phonemes
        if self.sort_phonemes:
            for symbol in sorted(static_phonemes):
                lex.add_phoneme(symbol=symbol,
                                variation=static_phonemes[symbol])
        else:
            lex.phonemes = static_phonemes

        if self.sort_lemmata:
            # Sort by first orth entry. A stable sort keeps every lemma, so
            # lemmata sharing the same first orth stay in their original
            # relative order instead of overwriting each other, and lemmata
            # without any orth are sorted to the front under the key "".
            lex.lemmata = sorted(
                self.static_lexicon.lemmata,
                key=lambda lemma: lemma.orth[0] if lemma.orth else "",
            )
        else:
            lex.lemmata = self.static_lexicon.lemmata

        write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
コード例 #3
0
    def run(self):
        """
        Build a lexicon from the text file ``self.text_file`` and write it as XML.

        Each non-blank line consists of an orthography field and a
        pronunciation field separated by whitespace. Within the pronunciation
        field, variants are separated by a backslash and the phonemes of one
        variant by whitespace. Only the part of the first field before a
        backslash is used as orthography. Lines with the same orthography are
        merged into one lemma; each pronunciation is stored once. The phoneme
        inventory is the sorted set of all phonemes seen in the file.

        Blank and whitespace-only lines are skipped.

        :raises IndexError: if a non-blank line has no pronunciation field
        """
        lex = lexicon.Lexicon()

        phonemes = set()
        seen_lemma = {}
        with uopen(self.text_file.get_path()) as f:
            for line in f:
                # splitting is taken from RASR
                # src/Tools/Bliss/blissLexiconLib.py#L185
                s = line.split(None, 1)
                if not s:
                    # blank line (e.g. a trailing newline at the end of the
                    # file): nothing to parse, and s[0] would raise IndexError
                    continue
                orth = s[0].split("\\", 1)[0]
                phon_variants = [
                    tuple(p.split()) for p in s[1].split("\\") if p.strip()
                ]
                for phon_variant in phon_variants:
                    phonemes.update(phon_variant)
                # normalise whitespace: phonemes joined by exactly one space
                phon = [" ".join(v) for v in phon_variants]
                if orth in seen_lemma:
                    # repeated orthography: extend the existing lemma
                    lemma = seen_lemma[orth]
                    for p in phon:
                        if p not in lemma.phon:
                            lemma.phon.append(p)
                else:
                    lemma = lexicon.Lemma(orth=[orth], phon=phon)
                    seen_lemma[orth] = lemma
                    lex.add_lemma(lemma)

        for phoneme in sorted(phonemes):
            lex.add_phoneme(phoneme)

        write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
コード例 #4
0
    def run(self):
        """
        Merge lemmata that share the same first orthography into a single lemma.

        Loads the lexicon from ``self.bliss_lexicon`` and groups its lemmata by
        their first orth entry. Special lemmata and lemmata without any orth
        are left untouched, as are lemmata with more than one orth unless
        ``self.merge_multi_orths_lemmata`` is set. Within each group the first
        lemma absorbs the orth, phon and eval entries it does not have yet
        from the others, takes over a ``synt`` value if it has none itself,
        and the other lemmata are removed from the lexicon. The result is
        written to ``self.out_bliss_lexicon``.
        """
        lex = lexicon.Lexicon()
        lex.load(self.bliss_lexicon.get_path())

        def append_missing(target, items):
            # add every item that is not yet contained in target, keeping order
            for item in items:
                if item not in target:
                    target.append(item)

        # collect merge candidates, keyed by their first orthography
        groups = collections.defaultdict(list)
        for candidate in lex.lemmata:
            if candidate.special:
                continue
            orth_count = len(candidate.orth)
            if orth_count == 0:
                continue
            if orth_count > 1 and not self.merge_multi_orths_lemmata:
                continue
            groups[candidate.orth[0]].append(candidate)

        for same_orth_lemmata in groups.values():
            if len(same_orth_lemmata) < 2:
                # nothing to merge for this orthography
                continue
            merged, *duplicates = same_orth_lemmata
            for duplicate in duplicates:
                append_missing(merged.orth, duplicate.orth)
                append_missing(merged.phon, duplicate.phon)
                # the first available synt wins
                if merged.synt is None and duplicate.synt is not None:
                    merged.synt = duplicate.synt
                append_missing(merged.eval, duplicate.eval)
                lex.lemmata.remove(duplicate)

        write_xml(self.out_bliss_lexicon, element_tree=lex.to_xml())
コード例 #5
0
    def run(self):
        """
        Write a lexicon with additional ``#``-suffixed phoneme variants.

        Loads the lexicon from ``self.bliss_lexicon``. Every phoneme is copied
        to the output lexicon; in addition, every phoneme that neither starts
        with ``[`` nor ends with ``]`` gets a second entry with ``#`` appended
        to its symbol and the same variation. Each pronunciation of every
        lemma is passed through ``self.modify_phon`` before the lemma is added
        to the output, which is written to ``self.out_lexicon``.
        """
        in_lexicon = Lexicon()
        in_lexicon.load(self.bliss_lexicon.get_path())

        out_lexicon = Lexicon()

        for phoneme, variation in in_lexicon.phonemes.items():
            out_lexicon.add_phoneme(phoneme, variation)
            # bracketed symbols (presumably special phonemes such as
            # silence/noise -- confirm) do not get a "#" variant
            if not (phoneme.startswith("[") or phoneme.endswith("]")):
                out_lexicon.add_phoneme(phoneme + "#", variation)

        for lemma in in_lexicon.lemmata:
            # materialise as a list: a bare map() object is a one-shot
            # iterator in Python 3, which would leave lemma.phon empty after
            # its first traversal and break len()/indexing on it
            lemma.phon = [self.modify_phon(phon) for phon in lemma.phon]
            out_lexicon.add_lemma(lemma)

        write_xml(self.out_lexicon.get_path(), out_lexicon.to_xml())
コード例 #6
0
    def run(self):
        """
        Merge all lexica listed in ``self.lexica`` and write the result as XML.

        Phonemes are collected in first-seen order; a symbol that occurs in
        several lexica has to carry the same variation everywhere. With
        ``self.sort_phonemes`` the merged phonemes are added in sorted symbol
        order instead. Lemmata are concatenated in the order of the lexica, or,
        with ``self.sort_lemmata``, ordered by their first orthography
        (lemmata without orth use the empty string as key).

        :raises AssertionError: if two lexica define the same phoneme symbol
            with different variations
        """
        merged_lex = lexicon.Lexicon()

        loaded = []
        for path in self.lexica:
            single = lexicon.Lexicon()
            single.load(path.get_path())
            loaded.append(single)

        # combine the phonemes
        phoneme_table = OrderedDict()
        for single in loaded:
            for symbol, variation in single.phonemes.items():
                if symbol not in phoneme_table:
                    phoneme_table[symbol] = variation
                    continue
                assert variation == phoneme_table[symbol], (
                    "conflicting phoneme variant for phoneme: %s" % symbol)

        if not self.sort_phonemes:
            merged_lex.phonemes = phoneme_table
        else:
            for symbol in sorted(phoneme_table):
                merged_lex.add_phoneme(symbol=symbol,
                                       variation=phoneme_table[symbol])

        # combine the lemmata
        if not self.sort_lemmata:
            # NOTE: lemmata are concatenated as-is; no check for overlapping
            # orthographies between the lexica is performed here
            for single in loaded:
                merged_lex.lemmata.extend(single.lemmata)
        else:
            by_first_orth = defaultdict(list)
            for single in loaded:
                for lemma in single.lemmata:
                    # sort by first orth entry
                    first_orth = lemma.orth[0] if lemma.orth else ""
                    by_first_orth[first_orth].append(lemma)
            merged_lex.lemmata = list(
                itertools.chain.from_iterable(
                    by_first_orth[key] for key in sorted(by_first_orth)))

        write_xml(self.out_bliss_lexicon.get_path(), merged_lex.to_xml())
コード例 #7
0
 def write_to_file(self, file):
     """Write the result of ``self.get_questions()`` to ``file`` via ``write_xml``."""
     questions = self.get_questions()
     write_xml(file, questions)