def run(self):
    """
    Rewrite the lexicon so that the silence phoneme is added before all others.

    The lexicon is loaded from ``self.bliss_lexicon``; its lemmata are reused
    unchanged, except that empty orthographies are removed from the lemma
    marked ``special="silence"`` when ``self.delete_empty_orth`` is set.
    The phoneme inventory is rebuilt with the silence phoneme added first,
    followed by the remaining phonemes in their original iteration order.
    The result is written as XML to ``self.out_lexicon``.

    :raises AssertionError: if a silence lemma does not have exactly one
        pronunciation entry, or if no silence lemma was found.
    """
    in_lexicon = Lexicon()
    in_lexicon.load(self.bliss_lexicon.get_path())

    out_lexicon = Lexicon()
    # the lemma list is shared with the loaded lexicon, not copied
    out_lexicon.lemmata = in_lexicon.lemmata

    silence_phon = None
    for lemma in out_lexicon.lemmata:
        if lemma.special != "silence":
            continue
        if self.delete_empty_orth:
            lemma.orth = [orth for orth in lemma.orth if orth != ""]
        # validate before indexing, so that an empty pronunciation list is
        # reported with this message instead of a bare IndexError
        assert len(lemma.phon) == 1, "Silence lemma does not have only one phoneme"
        silence_phon = lemma.phon[0]
    assert silence_phon, "No silence lemma found"

    # silence goes first, then everything else in the original order
    out_lexicon.add_phoneme(silence_phon, in_lexicon.phonemes[silence_phon])
    for phoneme, variation in in_lexicon.phonemes.items():
        if phoneme == silence_phon:
            continue
        out_lexicon.add_phoneme(phoneme, variation)

    write_xml(self.out_lexicon.get_path(), out_lexicon.to_xml())
def run(self):
    """
    Write ``self.static_lexicon`` as XML to ``self.out_bliss_lexicon``.

    If ``self.sort_phonemes`` is set, phonemes are added in sorted symbol
    order; otherwise the phoneme mapping of the static lexicon is reused.
    If ``self.sort_lemmata`` is set, lemmata are ordered by their first
    orthography; otherwise the lemma list of the static lexicon is reused.
    """
    lex = lexicon.Lexicon()

    source_phonemes = self.static_lexicon.phonemes
    if self.sort_phonemes:
        for symbol in sorted(source_phonemes.keys()):
            lex.add_phoneme(symbol=symbol, variation=source_phonemes[symbol])
    else:
        lex.phonemes = source_phonemes

    if self.sort_lemmata:
        # Stable sort by first orth entry. Unlike a dict keyed by the first
        # orth, this keeps every lemma when several share the same first
        # orth, and lemmata without any orth sort first instead of raising.
        lex.lemmata = sorted(
            self.static_lexicon.lemmata,
            key=lambda lemma: lemma.orth[0] if lemma.orth else "",
        )
    else:
        lex.lemmata = self.static_lexicon.lemmata

    write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
def run(self):
    """
    Build a lexicon from the text file ``self.text_file`` and write it as XML
    to ``self.out_bliss_lexicon``.

    Each non-blank line consists of an orthography followed by whitespace and
    one or more pronunciation variants separated by a backslash; a variant is
    a whitespace-separated phoneme sequence. Lines repeating an orthography
    add their new variants to the lemma created for its first occurrence.
    Blank lines are skipped. A line without a pronunciation field yields a
    lemma without pronunciations, the same as a line whose variants are all
    empty. The phoneme inventory is the sorted set of all phonemes seen.
    """
    lex = lexicon.Lexicon()
    phonemes = set()
    seen_lemma = {}
    with uopen(self.text_file.get_path()) as f:
        for line in f:
            # splitting is taken from RASR
            # src/Tools/Bliss/blissLexiconLib.py#L185
            s = line.split(None, 1)
            if not s:
                # blank or whitespace-only line: nothing to parse
                continue
            orth = s[0].split("\\", 1)[0]
            # tolerate a missing pronunciation field instead of raising IndexError
            pron_field = s[1] if len(s) > 1 else ""
            phon_variants = [
                tuple(p.split()) for p in pron_field.split("\\") if p.strip()
            ]
            for phon_variant in phon_variants:
                phonemes.update(phon_variant)
            phon = [" ".join(v) for v in phon_variants]

            lemma = seen_lemma.get(orth)
            if lemma is None:
                lemma = lexicon.Lemma(orth=[orth], phon=phon)
                seen_lemma[orth] = lemma
                lex.add_lemma(lemma)
            else:
                # repeated orthography: only add pronunciations not yet present
                for p in phon:
                    if p not in lemma.phon:
                        lemma.phon.append(p)

    for phoneme in sorted(phonemes):
        lex.add_phoneme(phoneme)

    write_xml(self.out_bliss_lexicon.get_path(), lex.to_xml())
def run(self):
    """
    Merge lemmata that share the same first orthography and write the result
    as XML to ``self.out_bliss_lexicon``.

    Special lemmata and lemmata without orthography are never merged.
    Lemmata with more than one orthography are only considered when
    ``self.merge_multi_orths_lemmata`` is set. Within a group, the first
    lemma absorbs the orth, phon and eval entries of the others (skipping
    entries it already has) and takes over the first non-``None`` ``synt``
    if it has none itself. The absorbed lemmata are removed from the lexicon.
    """

    def _extend_unique(target, items):
        # append the entries of ``items`` that ``target`` does not contain yet
        for item in items:
            if item not in target:
                target.append(item)

    lex = lexicon.Lexicon()
    lex.load(self.bliss_lexicon.get_path())

    orth2lemmata = collections.defaultdict(list)
    for lemma in lex.lemmata:
        if lemma.special:
            continue
        num_orths = len(lemma.orth)
        if num_orths < 1:
            continue
        if num_orths > 1 and not self.merge_multi_orths_lemmata:
            continue
        orth2lemmata[lemma.orth[0]].append(lemma)

    # ids of lemmata that were merged into another one; they are dropped in a
    # single pass below instead of one O(n) list.remove() per merged lemma
    merged_ids = set()
    for lemmata in orth2lemmata.values():
        if len(lemmata) < 2:
            continue
        final_lemma = lemmata[0]
        for lemma in lemmata[1:]:
            _extend_unique(final_lemma.orth, lemma.orth)
            _extend_unique(final_lemma.phon, lemma.phon)
            if final_lemma.synt is None and lemma.synt is not None:
                final_lemma.synt = lemma.synt
            _extend_unique(final_lemma.eval, lemma.eval)
            merged_ids.add(id(lemma))

    if merged_ids:
        # in-place update keeps the identity of the lemma list
        lex.lemmata[:] = [
            lemma for lemma in lex.lemmata if id(lemma) not in merged_ids
        ]

    write_xml(self.out_bliss_lexicon, element_tree=lex.to_xml())
def run(self):
    """
    Extend the phoneme inventory with ``#``-suffixed variants and rewrite all
    pronunciations via ``self.modify_phon``.

    The lexicon is loaded from ``self.bliss_lexicon``. Every phoneme is copied
    to the output lexicon; every phoneme that neither starts with ``[`` nor
    ends with ``]`` is additionally added with ``#`` appended, using the same
    variation. Each pronunciation of each lemma is replaced by the result of
    ``self.modify_phon``. The result is written as XML to ``self.out_lexicon``.
    """
    in_lexicon = Lexicon()
    in_lexicon.load(self.bliss_lexicon.get_path())

    out_lexicon = Lexicon()

    for phoneme, variation in in_lexicon.phonemes.items():
        out_lexicon.add_phoneme(phoneme, variation)
        if not (phoneme.startswith("[") or phoneme.endswith("]")):
            out_lexicon.add_phoneme(phoneme + "#", variation)

    for lemma in in_lexicon.lemmata:
        # materialize a list: a bare map() object is a one-shot iterator in
        # Python 3 and would be empty after the first traversal
        lemma.phon = [self.modify_phon(phon) for phon in lemma.phon]
        out_lexicon.add_lemma(lemma)

    write_xml(self.out_lexicon.get_path(), out_lexicon.to_xml())
def run(self):
    """
    Merge all lexica listed in ``self.lexica`` into a single lexicon and
    write it as XML to ``self.out_bliss_lexicon``.

    Phonemes are unified over all inputs; a symbol that occurs with differing
    variations triggers an assertion. With ``self.sort_phonemes`` the phonemes
    are added in sorted symbol order, otherwise in order of first occurrence.
    With ``self.sort_lemmata`` the lemmata are ordered by their first
    orthography (lemmata without orthography use the empty string as key),
    otherwise they are concatenated in input order.
    """
    # load every input lexicon up front
    loaded = []
    for lexicon_path in self.lexica:
        current = lexicon.Lexicon()
        current.load(lexicon_path.get_path())
        loaded.append(current)

    merged_lex = lexicon.Lexicon()

    # combine the phonemes, keeping the order of first occurrence
    merged_phonemes = OrderedDict()
    for current in loaded:
        for symbol, variation in current.phonemes.items():
            if symbol not in merged_phonemes:
                merged_phonemes[symbol] = variation
                continue
            assert variation == merged_phonemes[symbol], (
                "conflicting phoneme variant for phoneme: %s" % symbol
            )

    if not self.sort_phonemes:
        merged_lex.phonemes = merged_phonemes
    else:
        for symbol in sorted(merged_phonemes.keys()):
            merged_lex.add_phoneme(symbol=symbol, variation=merged_phonemes[symbol])

    # combine the lemmata
    if not self.sort_lemmata:
        # plain concatenation; overlapping orths are not filtered here
        for current in loaded:
            merged_lex.lemmata.extend(current.lemmata)
    else:
        all_lemmata = [lemma for current in loaded for lemma in current.lemmata]
        # a stable sort by first orth entry keeps lemmata with the same key
        # in their input order
        merged_lex.lemmata = sorted(
            all_lemmata,
            key=lambda lemma: lemma.orth[0] if lemma.orth else "",
        )

    write_xml(self.out_bliss_lexicon.get_path(), merged_lex.to_xml())
def write_to_file(self, file):
    """
    Write the result of ``self.get_questions()`` to ``file`` via ``write_xml``.

    :param file: output target, passed through to ``write_xml`` unchanged
    """
    questions = self.get_questions()
    write_xml(file, questions)