def update_symbols_and_text(sentences: SentenceList, sents_new_symbols: List[List[str]]):
  """Rebuild the symbol dictionary from the given per-sentence symbols and
  re-serialize every sentence's symbols and text accordingly.

  Returns the fresh SymbolIdDict together with the (mutated in place) sentence list.
  """
  updated_dict = SymbolIdDict.init_from_symbols(get_unique_items(sents_new_symbols))
  for sent, fresh_symbols in zip(sentences.items(), sents_new_symbols):
    sent.serialized_symbols = updated_dict.get_serialized_ids(fresh_symbols)
    sent.text = SymbolIdDict.symbols_to_text(fresh_symbols)
    # sanity: serialized ids and accent ids must stay aligned with the new symbols
    assert len(sent.get_symbol_ids()) == len(fresh_symbols)
    assert len(sent.get_accent_ids()) == len(fresh_symbols)
  return updated_dict, sentences
def sents_map(sentences: SentenceList, text_symbols: SymbolIdDict, symbols_map: SymbolsMap, ignore_arcs: bool, logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:
  """Apply a symbol mapping to every sentence, re-split the mapped text into
  sentences, and return the resulting symbol dictionary plus the new list.

  One source sentence may yield several mapped sentences; each of those reuses
  the first accent id of its source sentence for every symbol.
  """
  mapped_sents = SentenceList()
  collected_symbols: List[List[str]] = []
  counter = 0
  extraction_settings = IPAExtractionSettings(
    ignore_tones=False,
    ignore_arcs=ignore_arcs,
    replace_unknown_ipa_by=DEFAULT_PADDING_SYMBOL,
  )
  for sentence in sentences.items():
    source_symbols = text_symbols.get_symbols(sentence.serialized_symbols)
    source_accents = deserialize_list(sentence.serialized_accents)
    mapped_symbols = symbols_map.apply_to_symbols(source_symbols)
    mapped_text = SymbolIdDict.symbols_to_text(mapped_symbols)
    # a resulting empty text would make no problems
    for part_text in text_to_sentences(text=mapped_text, lang=sentence.lang, logger=logger):
      part_symbols = text_to_symbols(
        part_text,
        lang=sentence.lang,
        ipa_settings=extraction_settings,
        logger=logger,
      )
      # reuse the first accent of the source sentence for every mapped symbol
      part_accents = [source_accents[0]] * len(part_symbols) if len(source_accents) > 0 else []
      assert len(part_accents) == len(part_symbols)
      counter += 1
      new_sentence = Sentence(
        sent_id=counter,
        text=part_text,
        lang=sentence.lang,
        orig_lang=sentence.orig_lang,  # this is not correct but nearest possible currently
        original_text=sentence.original_text,
        serialized_accents=serialize_list(part_accents),
        serialized_symbols="",
      )
      collected_symbols.append(part_symbols)
      assert len(new_sentence.get_accent_ids()) == len(part_symbols)
      mapped_sents.append(new_sentence)
  return update_symbols_and_text(mapped_sents, collected_symbols)
def replace_unknown_symbols(self, model_symbols: SymbolIdDict, logger: Logger) -> bool:
  """Replace every symbol not known to the model with the padding symbol.

  Logs each affected sentence and returns True iff at least one sentence
  contained unknown symbols.
  """
  found_any = False
  for sentence in self.items():
    if model_symbols.has_unknown_symbols(sentence.symbols):
      sentence.symbols = model_symbols.replace_unknown_symbols_with_pad(
        sentence.symbols, pad_symbol=DEFAULT_PADDING_SYMBOL)
      replaced_text = SymbolIdDict.symbols_to_text(sentence.symbols)
      logger.info(f"Sentence {sentence.sent_id} contains unknown symbols: {replaced_text}")
      found_any = True
    # symbols and accents must remain aligned whether or not a replacement happened
    assert len(sentence.symbols) == len(sentence.accents)
  return found_any
def add_text(text: str, lang: Language, logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:
  """Split raw text into sentences, extract their symbols, and build a
  SentenceList with a matching SymbolIdDict.

  Every sentence gets the default accent id (0) for all of its symbols.
  """
  sentence_list = SentenceList()
  # each line is at least regarded as one sentence.
  collected_sents: List[str] = []
  for line in text.split("\n"):
    collected_sents.extend(text_to_sentences(text=line, lang=lang, logger=logger))
  extraction_settings = IPAExtractionSettings(
    ignore_tones=False,
    ignore_arcs=False,
    replace_unknown_ipa_by=DEFAULT_PADDING_SYMBOL,
  )
  per_sentence_symbols: List[List[str]] = [
    text_to_symbols(sent, lang=lang, ipa_settings=extraction_settings, logger=logger)
    for sent in collected_sents
  ]
  symbol_dict = SymbolIdDict.init_from_symbols(get_unique_items(per_sentence_symbols))
  default_accent_id = 0
  for sent_id, sent_symbols in enumerate(per_sentence_symbols, start=1):
    sentence_text = SymbolIdDict.symbols_to_text(sent_symbols)
    sentence_list.append(Sentence(
      sent_id=sent_id,
      lang=lang,
      serialized_symbols=symbol_dict.get_serialized_ids(sent_symbols),
      serialized_accents=serialize_list([default_accent_id] * len(sent_symbols)),
      text=sentence_text,
      original_text=sentence_text,
      orig_lang=lang,
    ))
  return symbol_dict, sentence_list