def test_normalize_text(inp):
    """Check that the output of ``normalize_text`` contains only ASCII."""
    result = normalize_text(inp)
    # ``isascii`` only exists on Python 3.7+, so use it only when present.
    is_ascii = getattr(result, "isascii", None)
    if is_ascii is not None:
        assert is_ascii()
    # Encoding to ASCII raises an exception if the text was not normalized.
    result.encode("ascii")
def sample_manifest(sample_data):
    """Write a JSON-lines manifest for the ``LapsBM-F004`` sample folder.

    One JSON object per line is written for every file returned by
    ``get_files(sample_data / "LapsBM-F004", ".wav")``. Each object has the
    keys ``audio_filepath`` (the resolved path as a string), ``duration``
    (the value returned by ``audio_len``) and ``text`` (the stripped and
    normalized content of the sibling ``.txt`` file).

    Args:
        sample_data: Directory that contains the ``LapsBM-F004`` folder and
            where the manifest is created. Must support the ``/`` operator
            (presumably a ``pathlib.Path`` -- confirm against the caller).

    Returns:
        The path of the written ``test_example_manifest.json`` file.
    """
    audio_files = get_files(sample_data / "LapsBM-F004", ".wav")
    manifest = sample_data / "test_example_manifest.json"
    with open(manifest, "w", encoding="utf8") as f:
        for fil in audio_files:
            data = {
                "audio_filepath": str(fil.resolve()),
                "duration": audio_len(fil),
                # Decode the transcript explicitly as UTF-8 (same encoding as
                # the manifest) instead of relying on the platform's locale
                # default, which would mis-decode accented characters on
                # systems whose default encoding is not UTF-8.
                "text": normalize_text(
                    fil.with_suffix(".txt").read_text(encoding="utf8").strip()
                ),
            }
            json.dump(data, f)
            # One record per line (JSON-lines layout).
            f.write("\n")
    return manifest
def test_normalize_text_specific_inputs():
    """Check that specific accented inputs map to their unaccented letters."""
    cases = (
        ("áàâã", "aaaa"),
        ("ç", "c"),
    )
    for raw, expected in cases:
        assert normalize_text(raw) == expected
def preprocess_text(self, text: str) -> str:
    """Apply ``normalize_text`` and then ``lower_text`` to the input.

    Args:
        text: The string to preprocess.

    Returns:
        The result of ``lower_text`` applied to the normalized text.
    """
    # Normalization runs first; its output is what gets lowercased.
    return lower_text(normalize_text(text))