def sents_convert_to_ipa(sentences: SentenceList, text_symbols: SymbolIdDict, ignore_tones: bool, ignore_arcs: bool, mode: Optional[EngToIpaMode], consider_ipa_annotations: bool, logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:
    """Convert every sentence in *sentences* to IPA (in place) and rebuild the symbol dict.

    Args:
        sentences: sentences to convert; mutated in place (lang, accents).
        text_symbols: symbol dict used to decode each sentence's serialized symbols.
        ignore_tones: drop tone marks during conversion.
        ignore_arcs: drop arc/ligature marks during conversion.
        mode: English-to-IPA conversion mode; required for English sentences.
        consider_ipa_annotations: honor inline IPA annotations in the text.
        logger: logger for diagnostics.

    Returns:
        The result of ``update_symbols_and_text``: a new symbol dict and the
        updated sentence list.

    Raises:
        ValueError: if an English sentence is encountered but *mode* is None.
    """
    sents_new_symbols = []
    for sentence in sentences.items(True):
        if sentence.lang == Language.ENG and mode is None:
            ex = "Please specify the ipa conversion mode."
            # logger.error, not logger.exception: we are not inside an except
            # block, so there is no active traceback to attach.
            logger.error(ex)
            # ValueError (a subclass of Exception) instead of bare Exception:
            # existing callers that catch Exception still work.
            raise ValueError(ex)
        new_symbols, new_accent_ids = symbols_to_ipa(
            symbols=text_symbols.get_symbols(sentence.serialized_symbols),
            lang=sentence.lang,
            accent_ids=deserialize_list(sentence.serialized_accents),
            ignore_arcs=ignore_arcs,
            ignore_tones=ignore_tones,
            mode=mode,
            replace_unknown_with=DEFAULT_PADDING_SYMBOL,
            consider_ipa_annotations=consider_ipa_annotations,
            logger=logger,
        )
        assert len(new_symbols) == len(new_accent_ids)
        sentence.lang = Language.IPA
        sentence.serialized_accents = serialize_list(new_accent_ids)
        sents_new_symbols.append(new_symbols)
        assert len(sentence.get_accent_ids()) == len(new_symbols)
    return update_symbols_and_text(sentences, sents_new_symbols)
def set_accent(sentences: SentenceList, accent_ids: AccentsDict, accent: str) -> SentenceList:
    """Assign the single accent *accent* to every symbol of every sentence (in place).

    Args:
        sentences: sentences to update; mutated in place.
        accent_ids: dict used to resolve *accent* to its id.
        accent: name of the accent to apply uniformly.

    Returns:
        The same, mutated SentenceList.

    Note: the return annotation previously claimed
    ``Tuple[SymbolIdDict, SentenceList]``, but only the SentenceList is
    returned (as in ``sents_accent_apply``); the annotation is now correct.
    """
    accent_id = accent_ids.get_id(accent)
    for sentence in sentences.items():
        new_accent_ids = [accent_id] * len(sentence.get_accent_ids())
        sentence.serialized_accents = serialize_list(new_accent_ids)
        assert len(sentence.get_accent_ids()) == len(sentence.get_symbol_ids())
    return sentences
def sents_map(sentences: SentenceList, text_symbols: SymbolIdDict, symbols_map: SymbolsMap, ignore_arcs: bool, logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:
    """Apply *symbols_map* to every sentence and rebuild the sentence list.

    Each mapped sentence is re-split into sentences (mapping may merge or
    introduce sentence boundaries); every resulting part inherits the first
    accent id of its source sentence. Returns the result of
    ``update_symbols_and_text``: a new symbol dict and the new SentenceList.
    """
    ipa_settings = IPAExtractionSettings(
        ignore_tones=False,
        ignore_arcs=ignore_arcs,
        replace_unknown_ipa_by=DEFAULT_PADDING_SYMBOL,
    )
    result = SentenceList()
    sents_new_symbols = []
    sent_counter = 0
    for src_sentence in sentences.items():
        src_symbols = text_symbols.get_symbols(src_sentence.serialized_symbols)
        src_accent_ids = deserialize_list(src_sentence.serialized_accents)
        mapped_text = SymbolIdDict.symbols_to_text(symbols_map.apply_to_symbols(src_symbols))
        # a resulting empty text would make no problems
        split_texts = text_to_sentences(
            text=mapped_text,
            lang=src_sentence.lang,
            logger=logger,
        )
        for part_text in split_texts:
            part_symbols = text_to_symbols(
                part_text,
                lang=src_sentence.lang,
                ipa_settings=ipa_settings,
                logger=logger,
            )
            # every part inherits the source sentence's first accent id
            part_accent_ids = [src_accent_ids[0]] * len(part_symbols) if len(src_accent_ids) > 0 else []
            assert len(part_accent_ids) == len(part_symbols)
            sent_counter += 1
            part_sentence = Sentence(
                sent_id=sent_counter,
                text=part_text,
                lang=src_sentence.lang,
                orig_lang=src_sentence.orig_lang,  # this is not correct but nearest possible currently
                original_text=src_sentence.original_text,
                serialized_accents=serialize_list(part_accent_ids),
                serialized_symbols=""
            )
            sents_new_symbols.append(part_symbols)
            assert len(part_sentence.get_accent_ids()) == len(part_symbols)
            result.append(part_sentence)
    return update_symbols_and_text(result, sents_new_symbols)
def sents_accent_apply(sentences: SentenceList, accented_symbols: AccentedSymbolList, accent_ids: AccentsDict) -> SentenceList:
    """Write the accents from *accented_symbols* back onto *sentences* (in place).

    The flat *accented_symbols* list is consumed sequentially: each sentence
    takes as many entries as it has accents. Returns the same, mutated
    SentenceList.
    """
    offset = 0
    for sentence in sentences.items():
        accents_needed = len(deserialize_list(sentence.serialized_accents))
        # the flat list must still hold enough entries for this sentence
        assert len(accented_symbols) >= offset + accents_needed
        selection: List[AccentedSymbol] = accented_symbols[offset:offset + accents_needed]
        offset += accents_needed
        updated_ids = accent_ids.get_ids([accented.accent for accented in selection])
        sentence.serialized_accents = serialize_list(updated_ids)
        assert len(sentence.get_accent_ids()) == len(sentence.get_symbol_ids())
    return sentences
def sents_normalize(sentences: SentenceList, text_symbols: SymbolIdDict, logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:
    """Normalize the symbols of every sentence (in place) and rebuild the symbol dict.

    Returns the result of ``update_symbols_and_text``: a new symbol dict and
    the updated sentence list.
    """
    # Maybe add info if something was unknown
    all_new_symbols = []
    for sentence in sentences.items():
        normalized_symbols, normalized_accent_ids = symbols_normalize(
            symbols=text_symbols.get_symbols(sentence.serialized_symbols),
            lang=sentence.lang,
            accent_ids=deserialize_list(sentence.serialized_accents),
            logger=logger,
        )
        # TODO: check if new sentences resulted and then split them.
        sentence.serialized_accents = serialize_list(normalized_accent_ids)
        all_new_symbols.append(normalized_symbols)
    return update_symbols_and_text(sentences, all_new_symbols)
def add_text(text: str, lang: Language, logger: Logger) -> Tuple[SymbolIdDict, SentenceList]:
    """Split *text* into sentences and build a symbol dict plus a SentenceList.

    Every symbol receives the default accent id 0; each sentence's text and
    original_text are both derived from its extracted symbols.
    """
    ipa_settings = IPAExtractionSettings(
        ignore_tones=False,
        ignore_arcs=False,
        replace_unknown_ipa_by=DEFAULT_PADDING_SYMBOL,
    )
    # each line is at least regarded as one sentence.
    all_sents = []
    for line in text.split("\n"):
        all_sents.extend(text_to_sentences(
            text=line,
            lang=lang,
            logger=logger,
        ))
    sents_symbols: List[List[str]] = []
    for sent in all_sents:
        sents_symbols.append(text_to_symbols(
            sent,
            lang=lang,
            ipa_settings=ipa_settings,
            logger=logger,
        ))
    symbols = SymbolIdDict.init_from_symbols(get_unique_items(sents_symbols))
    default_accent_id = 0
    res = SentenceList()
    for sent_number, sent_symbols in enumerate(sents_symbols, start=1):
        sent_text = SymbolIdDict.symbols_to_text(sent_symbols)
        res.append(Sentence(
            sent_id=sent_number,
            lang=lang,
            serialized_symbols=symbols.get_serialized_ids(sent_symbols),
            serialized_accents=serialize_list([default_accent_id] * len(sent_symbols)),
            text=sent_text,
            original_text=sent_text,
            orig_lang=lang,
        ))
    return symbols, res