def _get_all_speakers(l: PreDataList) -> Tuple[SpeakersDict, SpeakersLogDict]:
    """Collect the speakers that occur in *l*.

    Args:
        l: Parsed entries; every item exposes a ``speaker_name`` attribute.

    Returns:
        A pair ``(speakers_dict, speakers_log)``: the first is built with
        ``SpeakersDict.fromlist`` from the speaker names with duplicates
        removed, the second with ``SpeakersLogDict.fromcounter`` from the
        number of entries per speaker name.
    """
    names_per_entry: List[str] = []
    for item in l.items():
        names_per_entry.append(item.speaker_name)

    # Count before de-duplicating: the log is built from per-speaker totals.
    occurrences = Counter(names_per_entry)
    speakers_log = SpeakersLogDict.fromcounter(occurrences)

    unique_names = remove_duplicates_list_orderpreserving(names_per_entry)
    return SpeakersDict.fromlist(unique_names), speakers_log
def parse(dir_path: str, logger: Logger = getLogger()) -> PreDataList:
    """Parse a dataset laid out as one folder per speaker plus a README.md.

    The speaker table is read from lines 35-58 of ``README.md`` (slice
    ``[34:58]``); each row is expected to look like
    ``|name|gender|accent|...|...|``. Every sub-folder of *dir_path* whose
    name appears in that table must contain ``wav``, ``textgrid`` and
    ``transcript`` folders with the same number of files.

    Args:
        dir_path: Root directory of the dataset.
        logger: Logger used for progress and error messages.

    Returns:
        All parsed entries, sorted with ``sort_arctic``.

    Raises:
        FileNotFoundError: If *dir_path* does not exist.
        AssertionError: If a speaker folder has differing numbers of wav,
            textgrid and transcript files.
    """
    if not os.path.exists(dir_path):
        message = f"Directory not found: {dir_path}!"
        # logger.error instead of logger.exception: no exception is being
        # handled here, so there is no traceback worth logging.
        logger.error(message)
        raise FileNotFoundError(message)

    readme_path = os.path.join(dir_path, "README.md")
    readme = read_lines(readme_path)
    speaker_rows = readme[34:58]

    speakers_dict = {}
    for speaker_details in speaker_rows:
        # Drop the first and last character (the outer "|" of the table row)
        # before splitting the row into its five cells.
        name, gender, accent, _, _ = speaker_details[1:-1].split("|")
        speakers_dict[name] = gender, accent

    speaker_folders = get_subfolders(dir_path)
    lang = Language.ENG
    entries = PreDataList()

    logger.info("Parsing files...")
    for speaker_folder in tqdm(speaker_folders):
        speaker_name = get_basename(speaker_folder)
        if speaker_name not in speakers_dict:
            logger.info(f"Skipping {speaker_name}")
            continue

        wavs = get_filepaths(os.path.join(speaker_folder, "wav"))
        # The "annotation" folder is not read: per the original note there
        # are only 150 of them and they do not contain good IPA.
        textgrids = get_filepaths(os.path.join(speaker_folder, "textgrid"))
        transcripts = get_filepaths(os.path.join(speaker_folder, "transcript"))
        assert len(wavs) == len(textgrids) == len(transcripts), (
            f"Unequal number of wav/textgrid/transcript files in {speaker_folder}"
        )

        speaker_gender, speaker_accent = speakers_dict[speaker_name]
        accent_name = f"{speaker_accent}-{speaker_name}"
        gender = Gender.MALE if speaker_gender == "M" else Gender.FEMALE

        # Files are paired by position; the textgrid itself is not used.
        for wav, _textgrid, transcript in zip(wavs, textgrids, transcripts):
            # A full stop is appended to every transcript.
            text_en = f"{read_text(transcript)}."
            symbols = text_to_symbols(text_en, lang)
            entry = PreData(
                name=get_basename(wav),
                speaker_name=speaker_name,
                text=text_en,
                wav_path=wav,
                symbols=symbols,
                accents=[accent_name] * len(symbols),
                gender=gender,
                lang=lang,
            )
            entries.append(entry)

    entries.sort(key=sort_arctic, reverse=False)
    logger.info(
        f"Parsed {len(entries)} entries from {len(speakers_dict)} speakers.")
    return entries
def parse(dir_path: str) -> PreDataList:
    """Parse a dataset made of sub-folders that each hold a CSV and audio files.

    For every sub-folder of *dir_path* the CSV named by ``OATA_CSV_NAME`` is
    loaded as ``Entries`` and each row is turned into a ``PreData`` whose wav
    path points into the sub-folder's ``AUDIO_FOLDER_NAME`` directory.

    Args:
        dir_path: Root directory of the dataset.

    Returns:
        All entries, sorted by ``(speaker, subfolder, entry_id)``.

    Raises:
        FileNotFoundError: If *dir_path* does not exist.
    """
    if not os.path.exists(dir_path):
        print("Directory not found:", dir_path)
        raise FileNotFoundError(f"Directory not found: {dir_path}")

    lang = Language.ENG
    # Each element pairs the sort key with the entry it belongs to.
    keyed_entries: List[Tuple[Tuple, PreData]] = []

    for subfolder in tqdm(get_subfolders(dir_path)):
        # NOTE(review): `OATA_CSV_NAME` looks like a typo of `DATA_CSV_NAME`;
        # it is defined outside this function, so it is left as is - confirm.
        data_path = os.path.join(subfolder, OATA_CSV_NAME)
        entries = cast_as(Entries.load(Entry, data_path), Entries)
        for entry in entries.items():
            # Anything other than "m" is treated as female.
            gender = Gender.MALE if entry.gender == "m" else Gender.FEMALE
            symbols = text_to_symbols(entry.text, lang)
            wav_path = os.path.join(subfolder, AUDIO_FOLDER_NAME, entry.wav)
            data = PreData(
                name=entry.entry_id,
                speaker_name=entry.speaker,
                lang=lang,
                wav_path=wav_path,
                gender=gender,
                text=entry.text,
                symbols=symbols,
                accents=[entry.accent] * len(symbols),
            )
            sorting_keys = entry.speaker, subfolder, entry.entry_id
            keyed_entries.append((sorting_keys, data))

    keyed_entries.sort(key=lambda pair: pair[0])
    return PreDataList([data for _, data in keyed_entries])
def _get_ds_data(l: PreDataList, speakers_dict: SpeakersDict, accents: AccentsDict, symbols: SymbolIdDict) -> DsDataList:
    """Convert parsed entries into a ``DsDataList``.

    Args:
        l: Parsed entries to convert.
        speakers_dict: Lookup from speaker name to speaker id.
        accents: Dictionary used to serialize each entry's accents.
        symbols: Dictionary used to serialize each entry's symbols.

    Returns:
        One ``DsData`` per entry of *l*; ``entry_id`` is the entry's
        zero-based position in ``l.items()``.
    """
    converted = []
    for position, pre in enumerate(l.items()):
        speaker_id = speakers_dict[pre.speaker_name]
        symbol_ids = symbols.get_serialized_ids(pre.symbols)
        accent_ids = accents.get_serialized_ids(pre.accents)
        ds_entry = DsData(
            entry_id=position,
            basename=pre.name,
            speaker_name=pre.speaker_name,
            speaker_id=speaker_id,
            text=pre.text,
            serialized_symbols=symbol_ids,
            serialized_accents=accent_ids,
            wav_path=pre.wav_path,
            lang=pre.lang,
            gender=pre.gender,
        )
        converted.append(ds_entry)
    return DsDataList(converted)
def parse(dir_path: str, logger: Logger = getLogger()) -> PreDataList:
    """Parse a dataset laid out as ``<dataset>/<speaker id>/<chapter>/`` folders.

    Speaker metadata is read from ``SPEAKERS.txt``; its first 12 lines are
    skipped and every remaining line is split on ``" | "`` into
    ``id | gender | _ | _ | name``. Inside each chapter folder every
    ``*.wav`` file is paired by position with a ``*.normalized.txt`` file.
    The speaker name is also used as the accent name.

    Args:
        dir_path: Root directory of the dataset.
        logger: Logger used for progress and error messages.

    Returns:
        All parsed entries, sorted with ``sort_libri``.

    Raises:
        FileNotFoundError: If *dir_path* does not exist.
        AssertionError: If a chapter folder's wav and text files do not match
            up in number or in name.
    """
    if not os.path.exists(dir_path):
        message = f"Directory not found: {dir_path}"
        # Report through the injected logger instead of printing to stdout.
        logger.error(message)
        raise FileNotFoundError(message)

    speakers_path = os.path.join(dir_path, "SPEAKERS.txt")
    speaker_lines = read_lines(speakers_path)[12:]

    speakers_dict = {}
    for speaker_details in speaker_lines:
        s_id, gender, _, _, name = speaker_details.split(" | ")
        speakers_dict[s_id.strip()] = name.strip(), gender.strip()

    lang = Language.ENG
    entries = PreDataList()
    text_suffix = ".normalized"

    logger.info("Parsing files...")
    for dataset_folder in tqdm(get_subfolders(dir_path)):
        logger.info(f"Parsing {get_basename(dataset_folder)}...")

        for speaker_folder in tqdm(get_subfolders(dataset_folder)):
            speaker_id = get_basename(speaker_folder)
            speaker_name, speaker_gender = speakers_dict[speaker_id]
            accent_name = speaker_name
            gender = Gender.MALE if speaker_gender == "M" else Gender.FEMALE

            for chapter_folder in get_subfolders(speaker_folder):
                files = get_filepaths(chapter_folder)
                wavs = [x for x in files if x.endswith(".wav")]
                texts = [x for x in files if x.endswith(".normalized.txt")]
                assert len(wavs) == len(texts), (
                    f"Unequal number of wav and text files in {chapter_folder}"
                )

                for wav_file, text_file in zip(wavs, texts):
                    # Guard the positional pairing: the text file's basename
                    # minus its ".normalized" suffix must equal the wav's.
                    assert get_basename(wav_file) == get_basename(
                        text_file)[:-len(text_suffix)], (
                        f"Mismatched pair: {wav_file} / {text_file}"
                    )
                    text_en = read_text(text_file)
                    symbols = text_to_symbols(text_en, lang)
                    entry = PreData(
                        name=get_basename(wav_file),
                        speaker_name=speaker_name,
                        text=text_en,
                        wav_path=wav_file,
                        symbols=symbols,
                        accents=[accent_name] * len(symbols),
                        gender=gender,
                        lang=lang,
                    )
                    entries.append(entry)

    entries.sort(key=sort_libri, reverse=False)
    logger.info(
        f"Parsed {len(entries)} entries from {len(speakers_dict)} speakers.")
    return entries
def parse(path: str) -> PreDataList:
    """Parse a single-speaker dataset described by a ``metadata.csv`` file.

    *path* must contain ``metadata.csv`` and a ``wavs`` directory. Every
    metadata line is split on ``"|"``: field 0 is the wav basename and field 2
    is the text that is used. All entries get speaker name ``"1"``, accent
    ``"north_america"``, ``Gender.FEMALE`` and ``Language.ENG``.

    Args:
        path: Root directory of the dataset.

    Returns:
        All parsed entries, sorted with ``sort_ljs``.

    Raises:
        FileNotFoundError: If *path*, ``metadata.csv`` or the ``wavs``
            directory does not exist.
    """
    metadata_filepath = os.path.join(path, 'metadata.csv')
    wav_dirpath = os.path.join(path, 'wavs')

    # Checked in this order so that the most fundamental problem is reported.
    required = (
        ("Directory not found:", path),
        ("Metadatafile not found:", metadata_filepath),
        ("WAVs not found:", wav_dirpath),
    )
    for label, required_path in required:
        if not os.path.exists(required_path):
            print(label, required_path)
            raise FileNotFoundError(f"{label} {required_path}")

    result = PreDataList()
    speaker_name = '1'
    accent_name = "north_america"
    lang = Language.ENG
    gender = Gender.FEMALE

    lines = read_lines(metadata_filepath)
    print("Parsing files...")
    for line in tqdm(lines):
        parts = line.split('|')
        basename = parts[0]
        # parts[1] contains years, in parts[2] the years are written out
        # ex. ['LJ001-0045', '1469, 1470;', 'fourteen sixty-nine, fourteen seventy;']
        text = parts[2]
        wav_path = os.path.join(wav_dirpath, f'{basename}.wav')
        symbols = text_to_symbols(text, lang)
        entry = PreData(
            name=basename,
            speaker_name=speaker_name,
            text=text,
            wav_path=wav_path,
            symbols=symbols,
            accents=[accent_name] * len(symbols),
            gender=gender,
            lang=lang,
        )
        result.append(entry)

    result.sort(key=sort_ljs, reverse=False)
    print("Done.")
    return result
def parse(dir_path: str) -> PreDataList:
    """Parse a dataset whose ``data`` folder holds ``*.wav`` and ``*.trn`` files.

    The transcript of ``<name>.wav`` is expected at ``<name>.wav.trn``; wav
    files without such a transcript are skipped. Only the first line of a
    transcript is used. The wav basename must have the form
    ``<speaker>_<number>``; the speaker part is used as speaker name and as
    accent.

    Args:
        dir_path: Root directory of the dataset.

    Returns:
        All parsed entries, sorted by entry name.

    Raises:
        FileNotFoundError: If *dir_path* does not exist.
        ValueError: If a wav basename is not ``<speaker>_<integer>``.
    """
    if not os.path.exists(dir_path):
        print("Directory not found:", dir_path)
        raise FileNotFoundError(f"Directory not found: {dir_path}")

    data_dir = os.path.join(dir_path, "data")
    # A set, because membership is tested once per wav file below.
    sent_files = set(glob.glob(os.path.join(data_dir, "*.trn")))
    wav_files = glob.glob(os.path.join(data_dir, "*.wav"))

    candidates = sorted((wav, f"{wav}.trn") for wav in wav_files)
    wavs_sents = [pair for pair in candidates if pair[1] in sent_files]
    skipped_count = len(candidates) - len(wavs_sents)
    print("Skipped:", skipped_count, "of", len(candidates))

    res = PreDataList()
    print("Parsing files...")
    for wav, sent_file in tqdm(wavs_sents):
        content = read_lines(sent_file)
        chn = content[0].strip()
        # remove "=" from chinese transcription because it is not correct
        # occurs only in sentences with nr. 374, e.g. B22_374
        chn = chn.replace("= ", '')
        basename = os.path.splitext(os.path.basename(wav))[0]
        speaker, nr = basename.split("_")
        # The number is not stored; converting it keeps the check that the
        # file name has the expected "<speaker>_<integer>" form.
        int(nr)
        symbols = text_to_symbols(chn, Language.CHN)
        accents = [speaker] * len(symbols)
        # TODO Gender: every entry is currently marked as female.
        entry = PreData(basename, speaker, chn, wav, symbols, accents,
                        Gender.FEMALE, Language.CHN)
        res.append(entry)
    print("Done.")

    res.sort(key=lambda entry: entry.name)
    return res
def parse(dir_path: str) -> PreDataList:
    """Parse a dataset with word transcripts under ``doc/trans`` and audio under ``wav``.

    ``doc/trans/train.word.txt`` and ``doc/trans/test.word.txt`` are read line
    by line; each line is ``<speaker>_<number> <text>``. The matching audio is
    looked up as ``wav/<train|test>/<speaker>/<name>.wav`` (falling back to
    ``.WAV``); lines without audio are reported and skipped. A ``?`` is
    appended to texts ending in one of the question particles, ``。`` to all
    others.

    Args:
        dir_path: Root directory of the dataset.

    Returns:
        All parsed entries, sorted by speaker letter, speaker number and
        utterance number.

    Raises:
        FileNotFoundError: If *dir_path* does not exist.
        ValueError: If a transcript line does not have the expected form.
    """
    if not os.path.exists(dir_path):
        print("Directory not found:", dir_path)
        raise FileNotFoundError(f"Directory not found: {dir_path}")

    parse_paths = [
        (os.path.join(dir_path, 'doc/trans/train.word.txt'),
         os.path.join(dir_path, 'wav/train/')),
        (os.path.join(dir_path, 'doc/trans/test.word.txt'),
         os.path.join(dir_path, 'wav/test/')),
    ]

    # Each element pairs an entry with its sort key.
    files: List[Tuple[PreData, Tuple[str, int, int]]] = []
    lang = Language.CHN

    print("Parsing files...")
    for words_path, wavs_dir in parse_paths:
        lines = read_lines(words_path)

        for line in tqdm(lines):
            # Split at the first space only; a line without a space fails
            # loudly here instead of being sliced incorrectly.
            name, chinese = line.split(' ', 1)
            speaker_name, nr = name.split("_")
            speaker_gender = Gender.MALE if speaker_name in MALE_SPEAKERS else Gender.FEMALE
            nr = int(nr)
            speaker_name_letter = speaker_name[0]
            speaker_name_number = int(speaker_name[1:])

            wav_path = os.path.join(wavs_dir, speaker_name, name + '.wav')
            if not os.path.exists(wav_path):
                # Some recordings use an upper-case extension.
                wav_path = os.path.join(wavs_dir, speaker_name, name + '.WAV')
            if not os.path.exists(wav_path):
                print("Not found wav file:", wav_path)
                continue

            # remove "=" from chinese transcription because it is not correct
            # occurs only in sentences with nr. 374, e.g. B22_374
            chinese = chinese.replace("= ", '')
            is_question = chinese.endswith(
                (QUESTION_PARTICLE_1, QUESTION_PARTICLE_2))
            chinese += "?" if is_question else "。"

            symbols = text_to_symbols(chinese, lang)
            # Speakers without a dedicated accent use their own name.
            accent_name = ACCENTS.get(speaker_name, speaker_name)
            entry = PreData(
                name=name,
                speaker_name=speaker_name,
                text=chinese,
                wav_path=wav_path,
                symbols=symbols,
                accents=[accent_name] * len(symbols),
                gender=speaker_gender,
                lang=lang,
            )
            sort_key = (speaker_name_letter, speaker_name_number, nr)
            files.append((entry, sort_key))

    files.sort(key=lambda pair: pair[1], reverse=False)
    return PreDataList([entry for entry, _ in files])
def _get_symbols_id_dict(l: PreDataList) -> SymbolIdDict:
    """Build a ``SymbolIdDict`` from every distinct symbol used in *l*.

    Args:
        l: Parsed entries; every item exposes an iterable ``symbols``.

    Returns:
        The result of ``SymbolIdDict.init_from_symbols`` for the set of all
        symbols that occur in any entry.
    """
    symbols = set()
    for entry in l.items():
        # Update in place instead of rebuilding the whole set per entry.
        symbols.update(entry.symbols)
    return SymbolIdDict.init_from_symbols(symbols)
def _get_all_accents(l: PreDataList) -> AccentsDict:
    """Build an ``AccentsDict`` from every distinct accent used in *l*.

    Args:
        l: Parsed entries; every item exposes an iterable ``accents``.

    Returns:
        The result of ``AccentsDict.init_from_accents`` for the set of all
        accents that occur in any entry.
    """
    accents = set()
    for entry in l.items():
        # Update in place instead of rebuilding the whole set per entry.
        accents.update(entry.accents)
    return AccentsDict.init_from_accents(accents)