Ejemplo n.º 1
0
def parse(dir_path: Path) -> PreDataList:
    """Read a dataset folder made of ``metadata.csv`` and a ``wavs`` folder.

    Every line of the metadata file is split at ``|``. The first field is
    used as the basename of the recording and the third field as its text;
    the text is stored as a tuple of its characters.

    Args:
        dir_path: Root folder of the dataset.

    Returns:
        All parsed entries, ordered with ``sort_ljs`` and numbered through
        ``set_identifiers``.

    Raises:
        ValueError: If the root folder, the metadata file or the ``wavs``
            folder does not exist. The error is logged before it is raised.
    """
    logger = getLogger(__name__)

    metadata_filepath = dir_path / 'metadata.csv'
    wav_dirpath = dir_path / 'wavs'

    # Checked in this order; the first missing path aborts the parse.
    required_paths = (
        (dir_path, "Directory not found"),
        (metadata_filepath, "Metadatafile not found"),
        (wav_dirpath, "WAVs not found"),
    )
    for required_path, problem in required_paths:
        if required_path.exists():
            continue
        error = ValueError(f"{problem}: {required_path}")
        logger.error("", exc_info=error)
        raise error

    # All recordings share the same speaker description.
    speaker = 'Linda Johnson'
    accent = "North American"
    language = Language.ENG
    speaker_gender = Gender.FEMALE
    symbols_format = TextFormat.GRAPHEMES

    result = PreDataList()
    metadata_lines = read_lines(metadata_filepath)
    logger.info("Parsing files...")
    for metadata_line in tqdm(metadata_lines):
        # fields[1] contains years as digits, fields[2] has them written out,
        # e.g. ['LJ001-0045', '1469, 1470;', 'fourteen sixty-nine, fourteen seventy;']
        fields = metadata_line.split('|')
        basename = fields[0]
        audio_path = wav_dirpath / f'{basename}.wav'
        symbols = tuple(fields[2])

        result.append(PreData(
            identifier=0,  # placeholder, replaced by set_identifiers() below
            basename=basename,
            speaker_name=speaker,
            speaker_accent=accent,
            symbols=symbols,
            symbols_format=symbols_format,
            relative_audio_path=audio_path.relative_to(dir_path),
            speaker_gender=speaker_gender,
            symbols_language=language,
        ))

    result.sort(key=sort_ljs, reverse=False)
    result.set_identifiers()

    logger.info("Done.")

    return result
Ejemplo n.º 2
0
def parse(dir_path: Path) -> PreDataList:
  """Read a dataset laid out as ``<dataset>/<speaker>/<chapter>`` folders.

  Speaker names and genders are taken from the ``SPEAKERS_TXT`` file in the
  root folder. Each chapter folder must contain the same number of ``.wav``
  and ``.normalized.txt`` files; they are paired by name and handed to
  ``get_entry`` on a thread pool.

  Args:
    dir_path: Root folder of the dataset.

  Returns:
    All parsed entries, ordered with ``sort_libri`` and numbered through
    ``set_identifiers``.

  Raises:
    ValueError: If ``dir_path`` does not exist, or if a chapter folder holds
      a different number of audio and text files.
    KeyError: If a speaker folder has no matching line in the speakers file.
  """
  logger = getLogger(__name__)
  if not dir_path.exists():
    ex = ValueError(f"Directory not found: {dir_path}")
    logger.error("", exc_info=ex)
    raise ex

  speakers_path = dir_path / SPEAKERS_TXT
  # The first 12 lines are skipped (presumably the file header -- TODO confirm).
  speaker_lines = read_lines(speakers_path)[12:]
  speakers_dict = {}
  for speaker_details in speaker_lines:
    s_id, gender_code, _, _, name = speaker_details.split(" | ")
    speakers_dict[s_id.strip()] = name.strip(), gender_code.strip()

  logger.info("Detecting files...")
  arguments: List[Tuple[Path, Path, Gender, str, str]] = []
  for dataset_folder in tqdm(get_subfolders(dir_path)):
    logger.info(f"Parsing {get_basename(dataset_folder)}...")

    for speaker_folder in tqdm(get_subfolders(dataset_folder)):
      speaker_id = get_basename(speaker_folder)
      speaker_name, speaker_gender = speakers_dict[speaker_id]
      # The speaker name doubles as the accent name.
      accent_name = speaker_name
      gender = Gender.MALE if speaker_gender == "M" else Gender.FEMALE

      for chapter_folder in get_subfolders(speaker_folder):
        files = get_filepaths(chapter_folder)
        # Sort both lists so that the pairing below does not depend on the
        # order in which get_filepaths happens to return the files.
        wav_paths = sorted(file for file in files if file.suffix == ".wav")
        text_paths = sorted(file for file in files if str(file).endswith(".normalized.txt"))
        if len(wav_paths) != len(text_paths):
          raise ValueError(
            f"Found {len(wav_paths)} audio files but {len(text_paths)} text files in: {chapter_folder}"
          )

        for wav_file, text_file in zip(wav_paths, text_paths):
          assert get_basename(wav_file) == get_basename(text_file)[:-len(".normalized")], \
            f"Audio and text file do not belong together: {wav_file} / {text_file}"
          arguments.append((wav_file, text_file, gender, accent_name, speaker_name))

  mt_method = partial(
    get_entry,
    dir_path=dir_path,
  )

  logger.info("Parsing content...")
  # ThreadPoolExecutor rejects max_workers <= 0, which "cpu_count() - 1" yields
  # on a single-core machine; os.cpu_count() may additionally return None.
  max_workers = max(1, (cpu_count() or 1) - 1)
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
    entries = PreDataList(tqdm(executor.map(mt_method, arguments), total=len(arguments)))
  logger.info("Done.")

  entries.sort(key=sort_libri, reverse=False)
  entries.set_identifiers()

  logger.info(f"Parsed {len(entries)} entries from {len(speakers_dict)} speakers.")

  return entries
Ejemplo n.º 3
0
def export_transcriptions_to_folder(data: PreDataList, target_folder: Path,
                                    overwrite: bool) -> None:
    """Write the transcription of every entry into its own ``.txt`` file.

    The file name is the stem of the entry's ``relative_audio_path``. If the
    identifiers in ``data`` are not unique, the running index of the entry is
    prepended to the file name.

    Args:
        data: Entries whose ``symbols`` are exported.
        target_folder: Output folder; created (including parents) if missing.
        overwrite: If ``False``, files that already exist are skipped;
            otherwise they are replaced.
    """
    logger = getLogger(__name__)
    target_folder.mkdir(parents=True, exist_ok=True)

    attach_nrs = not data_has_unique_identifiers(data)
    if attach_nrs:
        logger.info(
            "The identifiers of the data are not unique therefore adding an index at the start of the entry name."
        )

    for entry_id, entry in enumerate(data.items()):
        prepend_str = f"{entry_id}-" if attach_nrs else ""
        target_txt_path = target_folder / f"{prepend_str}{entry.relative_audio_path.stem}.txt"
        if target_txt_path.is_file():
            if not overwrite:
                logger.info(f"Skipping existing file: {target_txt_path}")
                continue
            os.remove(target_txt_path)
        # The parsers store symbols as a tuple of single characters, which
        # write_text() rejects; joining also leaves a plain str unchanged.
        # UTF-8 is set explicitly so that non-ASCII transcriptions do not
        # depend on the platform's default encoding.
        target_txt_path.write_text("".join(entry.symbols), encoding="utf-8")

    logger.info(f"Written output to: {target_folder}")
Ejemplo n.º 4
0
def parse(dir_path: Path) -> PreDataList:
    """Read the ``*.wav`` / ``*.wav.trn`` pairs stored in ``<dir_path>/data``.

    For every audio file the transcription is expected in a file named like
    the audio file with ``.trn`` appended. Audio files without such a file
    are skipped. Only the first line of a transcription file is used.

    Args:
        dir_path: Root folder of the dataset.

    Returns:
        All parsed entries, ordered with ``sort_ds`` and numbered through
        ``set_identifiers``.

    Raises:
        ValueError: If ``dir_path`` does not exist, or if a basename does not
            have the form ``<speaker>_<number>``.
    """
    logger = getLogger(__name__)
    if not dir_path.exists():
        error = ValueError(f"Directory not found: {dir_path}")
        logger.error("", exc_info=error)
        raise error

    data_dir = dir_path / "data"
    known_transcripts = set(glob.glob(str(data_dir / "*.trn")))
    found_wavs = [Path(found) for found in glob.glob(str(data_dir / "*.wav"))]
    candidates = sorted((wav, Path(f"{wav}.trn")) for wav in found_wavs)

    # Keep only the pairs whose transcription file really exists.
    usable_pairs = []
    skipped_count = 0
    for candidate_wav, candidate_trn in candidates:
        if str(candidate_trn) in known_transcripts:
            usable_pairs.append((candidate_wav, candidate_trn))
        else:
            skipped_count += 1

    logger.info(f"Skipped: {skipped_count} of {len(candidates)}")

    language = Language.CHN
    symbols_format = TextFormat.GRAPHEMES
    res = PreDataList()
    logger.info("Parsing files...")
    wav_file: Path
    sent_file: Path
    for wav_file, sent_file in tqdm(usable_pairs):
        first_line = read_lines(sent_file)[0].strip()
        # remove "=" from chinese transcription because it is not correct
        # occurs only in sentences with nr. 374, e.g. B22_374
        symbols = tuple(first_line.replace("= ", ''))

        basename = get_basename(wav_file)
        speaker, nr = basename.split("_")
        # The number itself is not used, but a non-numeric one is still an error.
        int(nr)

        res.append(PreData(
            identifier=0,  # placeholder, replaced by set_identifiers() below
            basename=basename,
            speaker_name=speaker,
            speaker_accent=speaker,
            symbols=symbols,
            symbols_format=symbols_format,
            relative_audio_path=wav_file.relative_to(dir_path),
            # TODO Gender: every speaker is currently stored as female
            speaker_gender=Gender.FEMALE,
            symbols_language=language,
        ))

    res.sort(key=sort_ds)
    res.set_identifiers()
    logger.info("Done.")

    return res
Ejemplo n.º 5
0
def parse(dir_path: Path) -> PreDataList:
    """Read the train and test transcriptions together with their audio files.

    The transcriptions come from ``doc/trans/train.word.txt`` and
    ``doc/trans/test.word.txt``. Each line holds a recording name of the form
    ``<speaker>_<number>``, a space and the text. The audio file is looked up
    in ``wav/train/<speaker>`` or ``wav/test/<speaker>`` with the extension
    ``.wav`` and then ``.WAV``; lines without an audio file are skipped.
    A closing punctuation mark is appended to every text.

    Args:
        dir_path: Root folder of the dataset.

    Returns:
        All parsed entries, ordered by speaker letter, speaker number and
        recording number, and numbered through ``set_identifiers``.

    Raises:
        ValueError: If ``dir_path`` does not exist, or if a recording name
            cannot be split into its speaker and number parts.
    """
    logger = getLogger(__name__)
    if not dir_path.exists():
        error = ValueError(f"Directory not found: {dir_path}")
        logger.error("", exc_info=error)
        raise error

    sources = [
        (dir_path / 'doc' / 'trans' / f'{subset}.word.txt', dir_path / 'wav' / subset)
        for subset in ('train', 'test')
    ]

    language = Language.CHN
    symbols_format = TextFormat.GRAPHEMES
    # Each item pairs an entry with the key it is sorted by afterwards.
    keyed_entries: List[Tuple[PreData, Tuple[str, int, int]]] = []

    logger.info("Parsing files...")
    for words_path, wavs_dir in sources:
        line: str
        for line in tqdm(read_lines(words_path)):
            split_at = line.find(' ')
            name = line[:split_at]
            chinese = line[split_at + 1:]

            speaker_name, nr_text = name.split("_")
            if speaker_name in MALE_SPEAKERS:
                speaker_gender = Gender.MALE
            else:
                speaker_gender = Gender.FEMALE
            nr = int(nr_text)
            sort_key = (speaker_name[0], int(speaker_name[1:]), nr)

            speaker_wav_dir = wavs_dir / speaker_name
            wav_path = speaker_wav_dir / f"{name}.wav"
            if not wav_path.exists():
                # Fall back to the upper-case extension before giving up.
                wav_path = speaker_wav_dir / f"{name}.WAV"
                if not wav_path.exists():
                    logger.info(f"Found no wav file: {wav_path}")
                    continue

            # remove "=" from chinese transcription because it is not correct
            # occurs only in sentences with nr. 374, e.g. B22_374
            chinese = chinese.replace("= ", '')
            ends_with_question_particle = (
                chinese.endswith(QUESTION_PARTICLE_1)
                or chinese.endswith(QUESTION_PARTICLE_2)
            )
            chinese += "?" if ends_with_question_particle else "。"

            entry = PreData(
                identifier=0,  # placeholder, replaced by set_identifiers() below
                basename=name,
                speaker_name=speaker_name,
                symbols=tuple(chinese),
                symbols_format=symbols_format,
                # Speakers without an entry in ACCENTS use their own name.
                speaker_accent=ACCENTS.get(speaker_name, speaker_name),
                relative_audio_path=wav_path.relative_to(dir_path),
                speaker_gender=speaker_gender,
                symbols_language=language,
            )
            keyed_entries.append((entry, sort_key))

    keyed_entries.sort(key=lambda pair: pair[1], reverse=False)
    res = PreDataList([entry for entry, _ in keyed_entries])
    res.set_identifiers()

    return res
Ejemplo n.º 6
0
def data_has_unique_identifiers(data: PreDataList) -> bool:
    """Return ``True`` if no two entries of ``data`` share an identifier.

    An empty ``data`` counts as unique.
    """
    entry_count = 0
    distinct_identifiers = set()
    for entry in data.items():
        entry_count += 1
        distinct_identifiers.add(entry.identifier)
    # Duplicates collapse in the set, so the sizes differ exactly then.
    return entry_count == len(distinct_identifiers)
Ejemplo n.º 7
0
def parse(dir_path: Path) -> PreDataList:
    """Read a dataset laid out as ``<language>/by_book/<gender>/<speaker>/<book>``.

    ``<gender>`` is ``male`` or ``female``; either folder may be missing.
    Every book folder is expected to contain a ``metadata.csv`` file and a
    ``wavs`` folder. Each metadata line is split at ``|``; the first field is
    the basename of the recording and the third field its text. Lines whose
    audio file does not exist are reported and skipped.

    Args:
        dir_path: Root folder of the dataset.

    Returns:
        All parsed entries, ordered with ``sort_ds`` and numbered through
        ``set_identifiers``.

    Raises:
        ValueError: If ``dir_path`` does not exist.
        KeyError: If a language folder has no entry in ``LANGUAGES`` or
            ``ACCENTS``.
    """
    logger = getLogger(__name__)
    if not dir_path.exists():
        ex = ValueError(f"Directory not found: {dir_path}")
        logger.error("", exc_info=ex)
        raise ex

    data_paths: List[Tuple[Language, str, Gender, str, Path]] = []
    # Folder name below "by_book" and the gender assigned to its speakers.
    gender_folders = (("male", Gender.MALE), ("female", Gender.FEMALE))

    for language_dir in get_subfolders(dir_path):
        logger.info(f"Parsing {language_dir}...")
        language_name = get_basename(language_dir)
        lang = LANGUAGES[language_name]
        accent = ACCENTS[language_name]

        for gender_folder, gender in gender_folders:
            gender_dir = language_dir / "by_book" / gender_folder
            if not gender_dir.exists():
                continue
            for speaker_path in get_subfolders(gender_dir):
                speaker_name = get_basename(speaker_path)
                for book_path in get_subfolders(speaker_path):
                    data_paths.append(
                        (lang, accent, gender, speaker_name, book_path))

    result = PreDataList()
    text_format = TextFormat.GRAPHEMES

    book_path: Path
    for lang, accent_name, gender, speaker_name, book_path in tqdm(data_paths):
        metadata_path = book_path / "metadata.csv"
        wav_dirpath = book_path / 'wavs'
        for line in read_lines(metadata_path):
            parts = line.split('|')
            basename = parts[0]
            # parts[1] contains years, in parts[2] the years are written out
            # ex. ['LJ001-0045', '1469, 1470;', 'fourteen sixty-nine, fourteen seventy;']
            wav_path = wav_dirpath / f'{basename}.wav'
            text = parts[2]
            text_en_symbols = tuple(text)

            if not wav_path.is_file():
                # Reported through the logger like every other message here.
                logger.warning(f"file does not exist: {wav_path}")
                # These files do not exist:
                # en_UK/by_book/female/elizabeth_klett/jane_eyre/wavs/jane_eyre_27_f000439.wav
                # en_UK/by_book/female/elizabeth_klett/jane_eyre/wavs/jane_eyre_27_f000441.wav
                # en_US/by_book/female/judy_bieber/the_master_key/wavs/the_master_key_05_f000135.wav
                # en_US/by_book/female/mary_ann/midnight_passenger/wavs/midnight_passenger_05_f000269.wav
                # en_US/by_book/female/mary_ann/northandsouth/wavs/northandsouth_40_f000069.wav
                continue

            entry = PreData(
                identifier=0,  # placeholder, replaced by set_identifiers() below
                basename=basename,
                speaker_name=speaker_name,
                speaker_accent=accent_name,
                symbols=text_en_symbols,
                symbols_format=text_format,
                relative_audio_path=wav_path.relative_to(dir_path),
                speaker_gender=gender,
                symbols_language=lang,
            )

            result.append(entry)

    result.sort(key=sort_ds, reverse=False)
    result.set_identifiers()
    logger.info(f"Parsed {len(result)} entries.")

    return result
Ejemplo n.º 8
0
def parse(dir_path: Path) -> PreDataList:
  """Read a dataset with one folder per speaker and a readme speaker table.

  Gender and accent of the speakers are taken from lines 34 to 57 (0-based)
  of ``README_FILE``; folders of speakers not listed there are skipped.
  Every speaker folder must hold equally many files in its ``wav``,
  ``textgrid`` and ``transcript`` subfolders. The audio files and the
  transcripts are paired by position, and a ``.`` is appended to each
  transcript text.

  Args:
    dir_path: Root folder of the dataset.

  Returns:
    All parsed entries, ordered with ``sort_arctic`` and numbered through
    ``set_identifiers``.

  Raises:
    ValueError: If ``dir_path`` does not exist.
  """
  logger = getLogger(__name__)
  if not dir_path.exists():
    error = ValueError(f"Directory not found: {dir_path}")
    logger.error("", exc_info=error)
    raise error

  speakers_dict = {}
  for table_row in read_lines(dir_path / README_FILE)[34:58]:
    # The first and last character of a row are dropped before splitting.
    name, gender_code, accent, _, _ = table_row[1:-1].split("|")
    speakers_dict[name] = gender_code, accent

  speaker_folders = get_subfolders(dir_path)
  language = Language.ENG
  text_format = TextFormat.GRAPHEMES

  entries = PreDataList()

  logger.info("Parsing files...")
  for speaker_folder in tqdm(speaker_folders):
    speaker_name = get_basename(speaker_folder)
    if speaker_name not in speakers_dict:
      logger.info(f"Skipping {speaker_name}")
      continue

    # The "annotation" folder is not read: according to the original note it
    # covers only 150 files and does not contain good IPA.
    wavs = get_filepaths(speaker_folder / "wav")
    textgrids = get_filepaths(speaker_folder / "textgrid")
    transcripts = get_filepaths(speaker_folder / "transcript")

    assert len(wavs) == len(textgrids) == len(transcripts)

    speaker_gender, speaker_accent = speakers_dict[speaker_name]
    if speaker_gender == "M":
      gender = Gender.MALE
    else:
      gender = Gender.FEMALE

    for wav, transcript in zip(wavs, transcripts):
      text_en_symbols = tuple(f"{transcript.read_text()}.")

      entries.append(PreData(
        identifier=0,  # placeholder, replaced by set_identifiers() below
        basename=get_basename(wav),
        speaker_name=speaker_name,
        speaker_accent=speaker_accent,
        symbols=text_en_symbols,
        symbols_format=text_format,
        relative_audio_path=wav.relative_to(dir_path),
        speaker_gender=gender,
        symbols_language=language,
      ))

  entries.sort(key=sort_arctic, reverse=False)
  entries.set_identifiers()

  logger.info(f"Parsed {len(entries)} entries from {len(speakers_dict)} speakers.")

  return entries