def get_all_symbols(prep_dir: str) -> Set[str]:
    """Return the union of the symbols of every text stored under *prep_dir*.

    For each name yielded by ``get_subfolder_names(prep_dir)`` the text's
    symbol converter is loaded and the result of its ``get_all_symbols()``
    is merged into a single set.
    """
    collected: Set[str] = set()
    for name in get_subfolder_names(prep_dir):
        converter = load_text_symbol_converter(
            get_text_dir(prep_dir, name, create=False)
        )
        collected |= converter.get_all_symbols()
    return collected
def get_infer_sentences(base_dir: str, prep_name: str, text_name: str) -> InferSentenceList:
    """Build an ``InferSentenceList`` for a previously added text.

    The sentences and symbol converter are loaded from the text directory,
    the accent ids from the prepared directory, and all three are passed to
    ``InferSentenceList.from_sentences``.

    Raises:
        AssertionError: if the text directory does not exist. The message is
            also printed before raising.
    """
    prep_dir = get_prepared_dir(base_dir, prep_name, create=False)
    text_dir = get_text_dir(prep_dir, text_name, create=False)

    if not os.path.isdir(text_dir):
        message = f"The text '{text_name}' doesn't exist."
        print(message)
        # Raise explicitly instead of `assert False`: assert statements are
        # removed when Python runs with -O, which would let execution fall
        # through and try to load from a missing directory. AssertionError is
        # kept so callers that relied on the old exception type still work.
        raise AssertionError(message)

    return InferSentenceList.from_sentences(
        sentences=load_text_csv(text_dir),
        accents=load_prep_accents_ids(prep_dir),
        symbols=load_text_symbol_converter(text_dir),
    )
def _accent_template(base_dir: str, prep_name: str, text_name: str):
    """Regenerate the accent template of a text and save it.

    When the text directory is missing, a hint is printed and nothing else
    happens. Otherwise the result of ``infer_accents_template`` is written
    with ``_save_accents_csv``.
    """
    prep_dir = get_prepared_dir(base_dir, prep_name, create=False)
    text_dir = get_text_dir(prep_dir, text_name, create=False)

    # Guard: nothing to do without an existing text directory.
    if not os.path.isdir(text_dir):
        print("Please add text first.")
        return

    print("Updating accent template...")
    sentences = load_text_csv(text_dir)
    converter = load_text_symbol_converter(text_dir)
    accent_ids = load_prep_accents_ids(prep_dir)
    template = infer_accents_template(
        sentences=sentences,
        text_symbols=converter,
        accent_ids=accent_ids,
    )
    _save_accents_csv(text_dir, template)
def normalize_text(base_dir: str, prep_name: str, text_name: str):
    """Run ``infer_norm`` on a stored text and persist the outcome.

    When the text directory is missing, a hint is printed and nothing else
    happens. Otherwise the updated sentences are printed and saved, the
    symbol converter is saved, the accent template is refreshed and the
    unknown-symbol check is run.
    """
    prep_dir = get_prepared_dir(base_dir, prep_name, create=False)
    text_dir = get_text_dir(prep_dir, text_name, create=False)

    if not os.path.isdir(text_dir):
        print("Please add text first.")
        return

    print("Normalizing text...")
    sentences = load_text_csv(text_dir)
    converter = load_text_symbol_converter(text_dir)
    new_symbols, new_sentences = infer_norm(
        sentences=sentences,
        text_symbols=converter,
    )

    accent_ids = load_prep_accents_ids(prep_dir)
    formatted = new_sentences.get_formatted(
        symbol_id_dict=new_symbols,
        accent_id_dict=accent_ids,
    )
    print("\n" + formatted)

    # Persist first, then run the follow-up steps that re-read from disk.
    _save_text_csv(text_dir, new_sentences)
    save_text_symbol_converter(text_dir, new_symbols)
    _accent_template(base_dir, prep_name, text_name)
    _check_for_unknown_symbols(base_dir, prep_name, text_name)
def accent_apply(base_dir: str, prep_name: str, text_name: str):
    """Apply the stored accents CSV to a text and save the result.

    When the text directory is missing, a hint is printed and nothing else
    happens. Otherwise the sentences returned by ``infer_accents_apply`` are
    printed and saved, and the unknown-symbol check is run.
    """
    prep_dir = get_prepared_dir(base_dir, prep_name, create=False)
    text_dir = get_text_dir(prep_dir, text_name, create=False)

    if not os.path.isdir(text_dir):
        print("Please add text first.")
        return

    print("Applying accents...")
    sentences = load_text_csv(text_dir)
    accented = _load_accents_csv(text_dir)
    accent_ids = load_prep_accents_ids(prep_dir)
    new_sentences = infer_accents_apply(
        sentences=sentences,
        accented_symbols=accented,
        accent_ids=accent_ids,
    )

    # Converter and accent ids are loaded again for display, as before.
    formatted = new_sentences.get_formatted(
        symbol_id_dict=load_text_symbol_converter(text_dir),
        accent_id_dict=load_prep_accents_ids(prep_dir),
    )
    print("\n" + formatted)

    _save_text_csv(text_dir, new_sentences)
    _check_for_unknown_symbols(base_dir, prep_name, text_name)
def add_text(base_dir: str, prep_name: str, text_name: str, filepath: str, lang: Language):
    """Add the text read from *filepath* to a prepared dataset as *text_name*.

    When the prepared directory is missing, a hint is printed and nothing
    else happens. Otherwise the output of ``infer_add`` is printed and saved
    in the (created if needed) text directory together with its symbol
    converter; then the accent template is refreshed and the unknown-symbol
    check is run.
    """
    prep_dir = get_prepared_dir(base_dir, prep_name, create=False)

    if not os.path.isdir(prep_dir):
        print("Please prepare data first.")
        return

    print("Adding text...")
    raw_text = read_text(filepath)
    new_symbols, sentences = infer_add(text=raw_text, lang=lang)

    accent_ids = load_prep_accents_ids(prep_dir)
    formatted = sentences.get_formatted(
        symbol_id_dict=new_symbols,
        accent_id_dict=accent_ids,
    )
    print("\n" + formatted)

    # The text directory is only created once processing has succeeded.
    text_dir = get_text_dir(prep_dir, text_name, create=True)
    _save_text_csv(text_dir, sentences)
    save_text_symbol_converter(text_dir, new_symbols)
    _accent_template(base_dir, prep_name, text_name)
    _check_for_unknown_symbols(base_dir, prep_name, text_name)
def ipa_convert_text(base_dir: str, prep_name: str, text_name: str, ignore_tones: bool = False, ignore_arcs: bool = True):
    """Run ``infer_convert_ipa`` on a stored text and persist the outcome.

    ``ignore_tones`` and ``ignore_arcs`` are forwarded unchanged to
    ``infer_convert_ipa``. When the text directory is missing, a hint is
    printed and nothing else happens. Otherwise the updated sentences are
    printed and saved, the symbol converter is saved, the accent template is
    refreshed and the unknown-symbol check is run.
    """
    prep_dir = get_prepared_dir(base_dir, prep_name, create=False)
    text_dir = get_text_dir(prep_dir, text_name, create=False)

    if not os.path.isdir(text_dir):
        print("Please add text first.")
        return

    print("Converting text to IPA...")
    sentences = load_text_csv(text_dir)
    converter = load_text_symbol_converter(text_dir)
    new_symbols, new_sentences = infer_convert_ipa(
        sentences=sentences,
        text_symbols=converter,
        ignore_tones=ignore_tones,
        ignore_arcs=ignore_arcs,
    )

    accent_ids = load_prep_accents_ids(prep_dir)
    formatted = new_sentences.get_formatted(
        symbol_id_dict=new_symbols,
        accent_id_dict=accent_ids,
    )
    print("\n" + formatted)

    _save_text_csv(text_dir, new_sentences)
    save_text_symbol_converter(text_dir, new_symbols)
    _accent_template(base_dir, prep_name, text_name)
    _check_for_unknown_symbols(base_dir, prep_name, text_name)
def map_text(base_dir: str, prep_name: str, text_name: str, symbols_map_path: str, ignore_arcs: bool = True):
    """Run ``sents_map`` on a stored text with a symbols map loaded from disk.

    The map is read with ``SymbolsMap.load(symbols_map_path)`` and
    ``ignore_arcs`` is forwarded unchanged to ``sents_map``. When the text
    directory is missing, a hint is printed and nothing else happens.
    Otherwise the updated sentences are printed and saved, the symbol
    converter is saved, the accent template is refreshed and the
    unknown-symbol check is run.
    """
    prep_dir = get_prepared_dir(base_dir, prep_name, create=False)
    text_dir = get_text_dir(prep_dir, text_name, create=False)

    if not os.path.isdir(text_dir):
        print("Please add text first.")
        return

    sentences = load_text_csv(text_dir)
    converter = load_text_symbol_converter(text_dir)
    mapping = SymbolsMap.load(symbols_map_path)
    new_symbols, new_sentences = sents_map(
        sentences=sentences,
        text_symbols=converter,
        symbols_map=mapping,
        ignore_arcs=ignore_arcs,
    )

    accent_ids = load_prep_accents_ids(prep_dir)
    formatted = new_sentences.get_formatted(
        symbol_id_dict=new_symbols,
        accent_id_dict=accent_ids,
    )
    print("\n" + formatted)

    _save_text_csv(text_dir, new_sentences)
    save_text_symbol_converter(text_dir, new_symbols)
    _accent_template(base_dir, prep_name, text_name)
    _check_for_unknown_symbols(base_dir, prep_name, text_name)