def steps(step):
    """Go through steps"""
    if step == 1:
        session.clear()
        session["temp_dir"] = mkdtemp()
        temp_dir = session["temp_dir"]
        langs, lang_names = getLangs()
        return render_template(
            "upload.html",
            uploaded=uploaded_files(temp_dir),
            maps=[{"code": m, "name": lang_names[m]} for m in langs],
        )
    elif step == 2:
        return render_template("preview.html")
    elif step == 3:
        if "audio" not in session or "text" not in session:
            log = "Sorry, it looks like something is wrong with your audio or text. Please try again"
        else:
            flags = ["--force-overwrite"]
            for option in ["--closed-captioning", "--save-temps", "--text-grid"]:
                if session["config"].get(option, False):
                    flags.append(option)
            if session["text"].endswith("txt"):
                flags.append("--text-input")
                flags.append("--language")
                flags.append(session["config"]["lang"])
            timestamp = str(int(datetime.now().timestamp()))
            output_base = "aligned" + timestamp
            args = (
                ["readalongs", "align"]
                + flags
                + [
                    session["text"],
                    session["audio"],
                    os.path.join(session["temp_dir"], output_base),
                ]
            )
            LOGGER.warning(args)
            _, audio_ext = os.path.splitext(session["audio"])
            data = {"audio_ext": audio_ext, "base": output_base}
            if session["config"].get("show-log", False):
                log = run(args, capture_output=True, check=False)
                data["log"] = log
            else:
                run(args, check=False)
            data["audio_path"] = os.path.join(
                session["temp_dir"], output_base, output_base + audio_ext
            )
            data["audio_fn"] = f"/file/{output_base}" + audio_ext
            data["text_path"] = os.path.join(
                session["temp_dir"], output_base, output_base + ".xml"
            )
            data["text_fn"] = f"/file/{output_base}" + ".xml"
            data["smil_path"] = os.path.join(
                session["temp_dir"], output_base, output_base + ".smil"
            )
            data["smil_fn"] = f"/file/{output_base}" + ".smil"
        return render_template("export.html", data=data)
    else:
        abort(404)
def mute_section(audio: AudioSegment, start: int, end: int) -> AudioSegment:
    """Given an AudioSegment, reduce the gain between a given interval by 120 dB,
    effectively making it silent.

    Parameters
    ----------
    audio : AudioSegment
        audio segment to mute
    start : int
        start timestamp of audio (ms)
    end : int
        end timestamp of audio (ms)

    Returns
    -------
    AudioSegment
        A muted audio segment
    """
    try:
        return audio[:start] + audio[start:end].apply_gain(-120) + audio[end:]
    except IndexError:
        LOGGER.error(
            f"Tried to mute audio between {start} and {end}, but audio is only "
            f"{len(audio)}ms long. Returning unmuted audio instead."
        )
        return audio
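# Hedged usage sketch (not part of the original module): how mute_section might be
# called with pydub. The file names "speech.wav" and "speech_muted.wav" are
# placeholders chosen for illustration only.
def _example_mute_section():
    from pydub import AudioSegment

    seg = AudioSegment.from_file("speech.wav")
    # Silence the interval from 1.0 s to 2.5 s
    silenced = mute_section(seg, 1000, 2500)
    silenced.export("speech_muted.wav", format="wav")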
def make_fsg(word_elements, filename):
    # If the name includes special characters, pocketsphinx throws a
    # "RuntimeError: new_Decoder returned -1", so slugify it first.
    name = slugify(os.path.splitext(os.path.basename(filename))[0])
    data = {
        "name": name,
        "states": [],
        "num_states": 0,
    }
    for e in word_elements:
        if "id" not in e.attrib:  # don't put in elements with no id
            continue
        if not e.text or not e.text.strip():
            LOGGER.warning("No text in node %s", e.attrib["id"])
            continue
        text = e.text.strip()
        data["states"].append(
            {
                "id": e.attrib["id"] if text else "",
                "current": data["num_states"],
                "next": data["num_states"] + 1,
            }
        )
        data["num_states"] += 1
    data["final_state"] = data["num_states"]
    data["num_states"] += 1
    return chevron.render(FSG_TEMPLATE, data)
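# Hedged usage sketch (illustrative only): building minimal <w> elements with lxml
# and rendering an FSG from them. Assumes FSG_TEMPLATE, chevron and slugify are
# available in this module, as used above; the element ids and tokens are made up.
def _example_make_fsg():
    from lxml import etree

    words = []
    for i, token in enumerate(["hello", "world"]):
        w = etree.Element("w", attrib={"id": f"w{i}"})
        w.text = token
        words.append(w)
    fsg_text = make_fsg(words, "example_page.xml")
    print(fsg_text)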
def test_align_removed(self):
    """Try aligning section with removed audio"""
    # Process Audio
    removed_segment = remove_section(self.noisy_segment, 1500, 2500)
    audio_output_path = os.path.join(self.tempdir, "removed_sample.mp3")
    with open(audio_output_path, "wb") as f:
        removed_segment.export(f)
    # Align
    input_text_path = os.path.join(self.data_dir, "audio_sample.txt")
    input_audio_path = audio_output_path
    flags = ["-l", "eng"]
    output_path = os.path.join(self.tempdir, "output_removed")
    process = self.align(input_text_path, input_audio_path, output_path, flags)
    if process.returncode != 0:
        LOGGER.error("Subprocess readalongs align failed: %s", process.stderr)
    # Check Result
    smilpath = Path(output_path)
    smil_files = smilpath.glob("*.smil")
    self.assertTrue(
        next(smil_files, False),
        "No *.smil files found; "
        "pip install --force-reinstall --upgrade might be required "
        "if dependencies changed.",
    )
def extract_section(
    audio: AudioSegment, start: Union[None, int], end: Union[None, int]
) -> AudioSegment:
    """Given an AudioSegment, extract and keep only the [start, end) interval

    Args:
        audio (AudioSegment): audio segment to extract a section from
        start (Union[None, int]): start timestamp of audio to extract (ms)
            (None means beginning of audio)
        end (Union[None, int]): end timestamp of audio to extract (ms)
            (None means end of audio)

    Returns:
        AudioSegment: the extracted audio segment
    """
    # Optimization: don't copy the data if we're extracting from None to None
    if start is None and end is None:
        return audio
    try:
        return audio[start:end]
    except IndexError:
        LOGGER.error(
            f"Tried to extract audio between {start} and {end}, but audio is only "
            f"{len(audio)}ms long. Returning whole audio instead."
        )
        return audio
def return_temp_file(fname):
    fn, _ = os.path.splitext(fname)
    LOGGER.warning(session["temp_dir"])
    path = os.path.join(session["temp_dir"], fn, fname)
    if os.path.exists(path):
        return send_file(path)
    else:
        abort(404, "Sorry, we couldn't find that file.")
def test_generate_output_name(self):
    input_file = os.path.join(self.tempdir, "someinput.txt")
    copyfile(os.path.join(self.data_dir, "fra.txt"), input_file)
    results = self.runner.invoke(prepare, ["-l", "fra", input_file])
    LOGGER.warning("Output: {}".format(results.output))
    LOGGER.warning("Exception: {}".format(results.exception))
    self.assertEqual(results.exit_code, 0)
    self.assertRegex(results.stdout, "Wrote.*someinput[.]xml")
    self.assertTrue(os.path.exists(os.path.join(self.tempdir, "someinput.xml")))
def load_tsv(input_path, labels):
    results = []
    with open(input_path, "r", encoding="utf-8") as fin:
        for i, line in enumerate(fin, start=1):
            pieces = line.strip("\n").strip(" ").split("\t")
            if len(pieces) > len(labels):
                LOGGER.error("More columns than labels on line %s", i)
                continue
            results.append(OrderedDict(zip(labels, pieces)))
    return results
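# Hedged usage sketch (illustrative only): loading a two-column lexicon TSV.
# The file name "lexicon.tsv" and the label names are assumptions for the example,
# not values from the original module.
def _example_load_tsv():
    entries = load_tsv("lexicon.tsv", ["word", "pronunciation"])
    for entry in entries:
        print(entry["word"], "->", entry.get("pronunciation", ""))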
def remove_section(audio: AudioSegment, start: int, end: int) -> AudioSegment:
    """Given an AudioSegment, remove the section between start (ms) and end (ms)"""
    try:
        return audio[:start] + audio[end:]
    except IndexError:
        LOGGER.error(
            f"Tried to remove audio between {start} and {end}, but audio is only "
            f"{len(audio)}ms long. Returning unchanged audio instead."
        )
        return audio
def tokenize_xml(xml):
    tokenizer = XMLTokenizer()
    xml = deepcopy(xml)
    # FIXME: different langs have different normalizations, is this necessary?
    unicode_normalize_xml(xml)
    words = xml.xpath(".//w")
    if words:
        LOGGER.info("Words (<w>) already present; skipping tokenization")
        return xml
    LOGGER.info("Words (<w>) not present; tokenizing")
    return tokenizer.add_word_children(xml)
def tokenize_xml(xml):
    """Returns a deep copy of xml with all words wrapped in a "w" XML element"""
    xml = deepcopy(xml)
    # FIXME: different langs have different normalizations, is this necessary?
    unicode_normalize_xml(xml)
    words = xml.xpath(".//w")
    if words:
        LOGGER.info("Words (<w>) already present; skipping tokenization")
        return xml
    LOGGER.info("Words (<w>) not present; tokenizing")
    return tokenize_xml_in_place(xml)
def join_section(audio: AudioSegment, audio_to_insert: AudioSegment, start: int):
    """Given two AudioSegments, insert the second into the first at start (ms)"""
    try:
        return audio[:start] + audio_to_insert + audio[start:]
    except IndexError:
        LOGGER.error(
            f"Tried to insert audio at {start}, but audio is only {len(audio)}ms long. "
            "Returning unchanged audio instead."
        )
        return audio
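# Hedged usage sketch (illustrative only): cutting a one-second chunk out of a
# recording and splicing a tone back in at the same spot, using remove_section and
# join_section together. The file names and the beep are assumptions for the
# example; pydub's Sine generator is used for the tone.
def _example_remove_and_join():
    from pydub import AudioSegment
    from pydub.generators import Sine

    seg = AudioSegment.from_file("speech.wav")
    trimmed = remove_section(seg, 3000, 4000)          # drop 3.0 s - 4.0 s
    beep = Sine(440).to_audio_segment(duration=1000)   # 1 s, 440 Hz tone
    patched = join_section(trimmed, beep, 3000)        # insert beep at 3.0 s
    patched.export("speech_patched.wav", format="wav")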
def convert_word(word: str, lang: str):
    """Convert one individual word through the specified cascade of g2p mappings.

    Args:
        word (str): input word to map through g2p
        lang (str): the language code to use to attempt the g2p mapping

    Returns:
        g2p_text (str), valid (bool):
            - g2p_text is the word mapped from lang to output_orthography
            - valid is a flag indicating whether g2p conversion yielded valid
              output, which includes making sure IPA output was valid IPA and
              ARPABET output was valid ARPABET, at all intermediate steps as
              well as in the final output.
    """
    if lang == "eng":
        # Hack to use old English LexiconG2P
        # Note: adding eng_ prefix to vars that are used in both blocks to make
        # mypy happy. Since the two sides of the if are in the same scope, it
        # complains about type checking otherwise.
        assert output_orthography == "eng-arpabet"
        eng_converter = getLexiconG2P(
            os.path.join(os.path.dirname(LEXICON_PATH), "cmu_sphinx.metadata.json")
        )
        try:
            eng_text, _ = eng_converter.convert(word)
            eng_valid = is_arpabet(eng_text)
        except KeyError as e:
            if verbose_warnings:
                LOGGER.warning(f'Could not g2p "{word}" as English: {e.args[0]}')
            eng_text = word
            eng_valid = False
        return eng_text, eng_valid
    else:
        try:
            converter = make_g2p(lang, output_orthography)
        except InvalidLanguageCode as e:
            raise ValueError(
                f'Could not g2p "{word}" as "{lang}": invalid language code. '
                f"Use one of {getLangs()[0]}"
            ) from e
        except NoPath as e:
            raise ValueError(
                f'Could not g2p "{word}" as "{lang}": no path to "{output_orthography}". '
                f"Use one of {getLangs()[0]}"
            ) from e
        tg = converter(word)
        text = tg.output_string.strip()
        valid = converter.check(tg, shallow=True)
        if not valid and verbose_warnings:
            converter.check(tg, shallow=False, display_warnings=verbose_warnings)
        return text, valid
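# Hedged sketch (illustrative only, not part of the original module): what the
# non-English branch above boils down to when calling the g2p package directly.
# The language pair fra -> eng-arpabet and the word "bonjour" are assumptions
# chosen for the example.
def _example_g2p_direct():
    from g2p import make_g2p

    converter = make_g2p("fra", "eng-arpabet")
    tg = converter("bonjour")
    print(tg.output_string.strip())  # ARPABET rendering of "bonjour"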
def align(self, input_text_path, input_audio_path, output_path, flags):
    args = [
        "readalongs",
        "align",
        input_text_path,
        input_audio_path,
        output_path,
    ] + flags
    LOGGER.info(
        f"Aligning {input_text_path} and {input_audio_path}, outputting to {output_path}"
    )
    return run(args, capture_output=True)
def get_sequences(xml, xml_filename, unit="w", anchor="anchor") -> List[WordSequence]:
    """Return the list of anchor-separated word sequences in xml

    Args:
        xml (etree): xml structure in which to search for words and anchors
        xml_filename (str): filename, used for error messages only
        unit (str): element tag of the word units
        anchor (str): element tag of the anchors

    Returns:
        List[WordSequence]: all sequences found in xml
    """
    sequences: List[WordSequence] = []
    start = None
    words = []
    all_good = True
    for e in xml.xpath(f".//{unit} | .//{anchor}"):
        if e.tag == unit:
            words.append(e)
        else:
            assert e.tag == anchor
            try:
                end = parse_time(e.attrib["time"])
            except KeyError:
                LOGGER.error(
                    f'Invalid {anchor} element in {xml_filename}: missing "time" attribute'
                )
                all_good = False
                continue
            except ValueError as err:
                LOGGER.error(
                    f'Invalid {anchor} element in {xml_filename}: invalid "time" '
                    f'attribute "{e.attrib["time"]}": {err}'
                )
                all_good = False
                continue
            if words:
                sequences.append(WordSequence(start, end, words))
            words = []
            start = end
    if words:
        sequences.append(WordSequence(start, None, words))
    if not all_good:
        raise RuntimeError(
            f"Could not parse all anchors in {xml_filename}, please make sure each anchor "
            'element is properly formatted, e.g., <anchor time="34.5s"/>. Aborting.'
        )
    return sequences
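# Hedged usage sketch (illustrative only): a tiny document with one anchor splitting
# the words into two sequences. Assumes parse_time and WordSequence from this module,
# and that WordSequence exposes start, end and words as its constructor above suggests.
def _example_get_sequences():
    from lxml import etree

    xml = etree.fromstring(
        '<TEI><w id="w0">hello</w><anchor time="1.5s"/><w id="w1">world</w></TEI>'
    )
    for seq in get_sequences(xml, "example.xml"):
        print(seq.start, seq.end, [w.attrib["id"] for w in seq.words])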
def align(self, input_text_path, input_audio_path, output_path, flags):
    """Wrapper for invoking readalongs align via subprocess.run"""
    args = [
        "readalongs",
        "align",
        input_text_path,
        input_audio_path,
        output_path,
    ] + flags
    LOGGER.info(
        f"Aligning {input_text_path} and {input_audio_path}, outputting to {output_path}"
    )
    return run(args, capture_output=True, check=False, encoding="utf-8")
def encode_from_path(path: str) -> str:
    """Encode file from bytes to b64 string with data and mime signature

    Args:
        path (str): path to file

    Returns:
        str: base64 string with data and mime signature
    """
    import requests  # Defer expensive import

    with open(path, "rb") as f:
        path_bytes = f.read()
    if path.endswith("xml"):
        root = etree.fromstring(path_bytes)
        for img in root.xpath("//graphic"):
            url = img.get("url")
            res = requests.get(url) if url.startswith("http") else None
            mime = guess_type(url)
            if os.path.exists(url):
                with open(url, "rb") as img_f:
                    img_bytes = img_f.read()
                img_b64 = str(b64encode(img_bytes), encoding="utf8")
            elif res and res.status_code == 200:
                img_b64 = str(b64encode(res.content), encoding="utf8")
            else:
                LOGGER.warning(
                    f"The image declared at {url} could not be found. "
                    "Please check that it exists."
                )
                continue
            img.attrib["url"] = f"data:{mime[0]};base64,{img_b64}"
        path_bytes = etree.tostring(root)
    b64 = str(b64encode(path_bytes), encoding="utf8")
    mime = guess_type(path)
    if path.endswith(".m4a"):
        # Hack to get around guess_type choosing the wrong mime type for .m4a files
        # TODO: Check other popular audio formats, .wav, .mp3, .ogg, etc...
        mime_type = "audio/mp4"
    elif mime[0]:
        # Hack: until we properly extract audio from video files, force any
        # video-based mime type to be read as audio
        mime_type = mime[0].replace("video", "audio")
    else:
        mime_type = "application"
        LOGGER.warning(
            f"We could not guess the mime type of file at {path}; we will try the "
            "generic mime type 'application', but this might not work with some files."
        )
    return f"data:{mime_type};base64,{b64}"
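# Hedged usage sketch (illustrative only): turning an audio file into a data URI
# suitable for embedding in a single-file HTML export. "aligned.mp3" is a
# placeholder path, not a value from the original module.
def _example_encode_from_path():
    data_uri = encode_from_path("aligned.mp3")
    print(data_uri[:40])  # e.g. "data:audio/mpeg;base64,..."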
def make_dict(word_elements, input_filename, unit="m"):
    data = {"items": []}
    nwords = 0
    for e in word_elements:
        if "id" not in e.attrib:
            LOGGER.error(
                "%s-type element without id in file %s", unit, input_filename
            )
        text = e.attrib.get("ARPABET", "").strip()
        if not text:
            continue
        nwords += 1
        data["items"].append({"id": e.attrib["id"], "pronunciation": text})
    if nwords == 0:
        raise RuntimeError("No words in dictionary!")
    return chevron.render(DICT_TEMPLATE, data)
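# Hedged usage sketch (illustrative only): rendering a pronunciation dictionary from
# a morpheme-level element that already carries an ARPABET attribute. The id and
# ARPABET values are placeholders; DICT_TEMPLATE and chevron come from this module.
def _example_make_dict():
    from lxml import etree

    m = etree.Element("m", attrib={"id": "m0", "ARPABET": "HH AH L OW"})
    dict_text = make_dict([m], "example.xml", unit="m")
    print(dict_text)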
def process_src_attrib(src_text, id_prefix, mimetypes):
    filename = src_text.split("#")[0]
    filename_without_ext, ext = os.path.splitext(filename)
    ext = ext.strip(".")
    if ext not in mimetypes:
        LOGGER.warning("Unknown extension in SMIL: %s", ext)
        return None
    entry = {
        "origin_path": filename,
        "dest_path": filename,
        "ext": ext.lower(),
        "id": id_prefix + os.path.basename(filename_without_ext),
        "mimetype": mimetypes[ext],
    }
    return entry
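# Hedged usage sketch (illustrative only): resolving a SMIL src attribute against a
# small extension-to-mimetype table. The table, prefix and src value are assumptions
# for the example, not values from the original module.
def _example_process_src_attrib():
    mimetypes = {"xml": "application/xml", "mp3": "audio/mpeg"}
    entry = process_src_attrib("story.xml#s0w3", "item_", mimetypes)
    print(entry)
    # -> {'origin_path': 'story.xml', 'dest_path': 'story.xml', 'ext': 'xml',
    #     'id': 'item_story', 'mimetype': 'application/xml'}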
def test_align_sample(self):
    """Sanity check that test audio should align"""
    # Align
    input_text_path = os.path.join(self.data_path, "audio_sample.txt")
    input_audio_path = os.path.join(self.data_path, "audio_sample.ogg")
    flags = ["-i", "-l", "eng"]
    output_path = os.path.join(self.tempdir, "output")
    log = self.align(input_text_path, input_audio_path, output_path, flags)
    LOGGER.info(str(log))
    # Check Result
    smilpath = Path(output_path)
    smil_files = smilpath.glob("*.smil")
    self.assertGreaterEqual(len([x for x in smil_files]), 1)
    self.assertFalse("error" in str(log).lower())
class BasicTestCase(TestCase):
    """A basic unittest building block class that comes bundled with a temporary
    directory (self.tempdir) and access to an app test runner (self.runner)
    """

    LOGGER.setLevel("DEBUG")
    data_dir = os.path.join(os.path.dirname(__file__), "data")

    # Set this to True to keep the temp dirs after running, for manual inspection,
    # but please don't push a commit setting this to True!
    keep_temp_dir_after_running = False

    def setUp(self):
        """Create a temporary directory, self.tempdir, and a test runner, self.runner"""
        app.logger.setLevel("DEBUG")
        self.runner = app.test_cli_runner()
        tempdir_prefix = f"tmpdir_{type(self).__name__}_"
        if not self.keep_temp_dir_after_running:
            self.tempdirobj = tempfile.TemporaryDirectory(
                prefix=tempdir_prefix, dir="."
            )
            self.tempdir = self.tempdirobj.name
        else:
            # Alternative tempdir code keeps it after running, for manual inspection:
            self.tempdir = tempfile.mkdtemp(prefix=tempdir_prefix, dir=".")
            print("tmpdir={}".format(self.tempdir))

    def tearDown(self):
        """Clean up the temporary directory"""
        if not self.keep_temp_dir_after_running:
            self.tempdirobj.cleanup()
class TestForceAlignment(unittest.TestCase):
    LOGGER.setLevel("DEBUG")
    data_dir = os.path.join(os.path.dirname(__file__), "data")

    def testAlign(self):
        xml_path = os.path.join(self.data_dir, "ej-fra.xml")
        wav_path = os.path.join(self.data_dir, "ej-fra.m4a")
        results = align_audio(xml_path, wav_path, unit="w")

        # Verify that the same IDs are in the output
        converted_path = os.path.join(self.data_dir, "ej-fra-converted.xml")
        xml = etree.parse(converted_path).getroot()
        words = results["words"]
        xml_words = xml.xpath(".//w")
        self.assertEqual(len(words), len(xml_words))
        for w, xw in zip(words, xml_words):
            self.assertEqual(xw.attrib["id"], w["id"])

    def testAlignText(self):
        txt_path = os.path.join(self.data_dir, "ej-fra.txt")
        wav_path = os.path.join(self.data_dir, "ej-fra.m4a")
        tempfh, temp_fn = create_input_tei(
            input_file_name=txt_path, text_language="fra", save_temps=None
        )
        results = align_audio(temp_fn, wav_path, unit="w", save_temps=None)

        # Verify that the same IDs are in the output
        converted_path = os.path.join(self.data_dir, "ej-fra-converted.xml")
        xml = etree.parse(converted_path).getroot()
        words = results["words"]
        xml_words = xml.xpath(".//w")
        self.assertEqual(len(words), len(xml_words))
        for w, xw in zip(words, xml_words):
            self.assertEqual(xw.attrib["id"], w["id"])
def run_tests(suite):
    if suite == "e2e":
        suite = TestSuite(e2e_tests)
    elif suite == "dev":
        suite = TestSuite(indices_tests + other_tests + e2e_tests)
    elif suite == "prod" or suite == "all":
        suite = loader.discover(os.path.dirname(__file__))
    elif suite == "other":
        suite = TestSuite(other_tests)
    else:
        LOGGER.error(
            "Sorry, you need to select a Test Suite to run, like 'dev' or 'prod'"
        )
    runner = TextTestRunner(verbosity=3)
    return runner.run(suite)
def run_tests(suite):
    """Run the specified test suite"""
    if suite == "e2e":
        suite = TestSuite(e2e_tests)
    elif suite == "dev":
        suite = TestSuite(indices_tests + other_tests + e2e_tests)
    elif suite in ("prod", "all"):
        suite = loader.discover(os.path.dirname(__file__))
    elif suite == "other":
        suite = TestSuite(other_tests)
    else:
        LOGGER.error(
            "Sorry, you need to select a Test Suite to run, one of: "
            "dev, all (or prod), e2e, other"
        )
        sys.exit(1)
    runner = TextTestRunner(verbosity=3)
    return runner.run(suite)
def create_web_component_html(
    text_path: str,
    alignment_path: str,
    audio_path: str,
    title="Title goes here",
    header="Header goes here",
    subheader="Subheader goes here",
    theme="light",
) -> str:
    import requests  # Defer expensive import

    js = requests.get(JS_BUNDLE_URL)
    fonts = requests.get(FONTS_BUNDLE_URL)
    if js.status_code != 200:
        LOGGER.warning(
            f"Sorry, the JavaScript bundle that is supposed to be at {JS_BUNDLE_URL} "
            f"returned a {js.status_code}. Your ReadAlong will be bundled using a "
            "version that may not be up-to-date. Please check your internet connection."
        )
        with open(
            os.path.join(os.path.dirname(__file__), "bundle.js"), encoding="utf8"
        ) as f:
            js_raw = f.read()
    else:
        js_raw = js.text
    if fonts.status_code != 200:
        LOGGER.warning(
            f"Sorry, the fonts bundle that is supposed to be at {FONTS_BUNDLE_URL} "
            f"returned a {fonts.status_code}. Your ReadAlong will be bundled using a "
            "version that may not be up-to-date. Please check your internet connection."
        )
        with open(
            os.path.join(os.path.dirname(__file__), "bundle.css"), encoding="utf8"
        ) as f:
            fonts_raw = f.read()
    else:
        fonts_raw = fonts.text

    return BASIC_HTML.format(
        text=encode_from_path(text_path),
        alignment=encode_from_path(alignment_path),
        audio=encode_from_path(audio_path),
        js=js_raw,
        fonts=fonts_raw,
        title=title,
        header=header,
        subheader=subheader,
        theme=theme,
    )
def test_align_removed(self):
    """Try aligning section with removed audio"""
    # Process Audio
    removed_segment = remove_section(self.noisy_segment, 1500, 2500)
    audio_output_path = os.path.join(self.tempdir, "removed_sample.mp3")
    removed_segment.export(audio_output_path)
    # Align
    input_text_path = os.path.join(self.data_path, "audio_sample.txt")
    input_audio_path = audio_output_path
    flags = ["-i", "-l", "eng"]
    output_path = os.path.join(self.tempdir, "output_removed")
    log = self.align(input_text_path, input_audio_path, output_path, flags)
    LOGGER.info(str(log))
    # Check Result
    smilpath = Path(output_path)
    smil_files = smilpath.glob("*.smil")
    self.assertGreaterEqual(len([x for x in smil_files]), 1)
    self.assertFalse("error" in str(log).lower())
class TestTokenizeCli(TestCase):
    LOGGER.setLevel("DEBUG")
    data_dir = os.path.join(os.path.dirname(__file__), "data")

    def setUp(self):
        app.logger.setLevel("DEBUG")
        self.runner = app.test_cli_runner()
        self.tempdirobj = tempfile.TemporaryDirectory(
            prefix="test_tokenize_cli_tmpdir", dir="."
        )
        self.tempdir = self.tempdirobj.name
        # Alternative tempdir code keeps it after running, for manual inspection:
        # self.tempdir = tempfile.mkdtemp(prefix="test_tokenize_cli_tmpdir", dir=".")
        # print('tmpdir={}'.format(self.tempdir))
        self.xmlfile = os.path.join(self.tempdir, "fra.xml")
        _ = self.runner.invoke(
            prepare,
            ["-l", "fra", os.path.join(self.data_dir, "fra.txt"), self.xmlfile],
        )

    def tearDown(self):
        self.tempdirobj.cleanup()

    def test_invoke_tok(self):
        results = self.runner.invoke(
            tokenize, [self.xmlfile, os.path.join(self.tempdir, "delme")]
        )
        self.assertEqual(results.exit_code, 0)
        self.assertTrue(os.path.exists(os.path.join(self.tempdir, "delme.xml")))

    def test_generate_output_name(self):
        results = self.runner.invoke(tokenize, [self.xmlfile])
        self.assertEqual(results.exit_code, 0)
        self.assertTrue(
            os.path.exists(os.path.join(self.tempdir, "fra.tokenized.xml"))
        )

    def test_with_stdin(self):
        with io.open(self.xmlfile) as f:
            inputtext = f.read()
        results = self.runner.invoke(tokenize, "-", input=inputtext)
        self.assertEqual(results.exit_code, 0)
        self.assertIn(
            "<s><w>Ceci</w> <w>est</w> <w>une</w> <w>phrase</w>", results.output
        )

    def test_file_already_exists(self):
        results = self.runner.invoke(tokenize, [self.xmlfile, self.xmlfile])
        self.assertNotEqual(results.exit_code, 0)
        self.assertIn("use -f to overwrite", results.output)

    def test_bad_input(self):
        results = self.runner.invoke(tokenize, "- -", input="this is not XML!")
        self.assertNotEqual(results.exit_code, 0)
        self.assertIn("Error parsing", results.output)
def test_align_sample(self):
    """Sanity check that test audio should align"""
    # Align
    input_text_path = os.path.join(self.data_dir, "audio_sample.txt")
    input_audio_path = os.path.join(self.data_dir, "audio_sample.ogg")
    flags = ["-l", "eng"]
    output_path = os.path.join(self.tempdir, "output")
    process = self.align(input_text_path, input_audio_path, output_path, flags)
    if process.returncode != 0:
        LOGGER.error("Subprocess readalongs align failed: %s", process.stderr)
    # Check Result
    smilpath = Path(output_path)
    smil_files = smilpath.glob("*.smil")
    self.assertTrue(
        next(smil_files, False),
        "No *.smil files found; "
        "pip install --force-reinstall --upgrade might be required "
        "if dependencies changed.",
    )
def __init__(self, metadata_path):
    self.metadata = load_json(metadata_path)
    self.in_lang = self.metadata["in_metadata"]["lang"]
    self.out_lang = self.metadata["out_metadata"]["lang"]
    dirname = os.path.dirname(metadata_path)
    if "src" not in self.metadata:
        LOGGER.error("File %s does not specify a source document", metadata_path)
        return
    self.src_path = os.path.join(dirname, self.metadata["src"])
    self.entries = defaultdict(list)
    if "src_format" not in self.metadata:
        LOGGER.error(
            "File %s lacking a source format ('src_format') attribute",
            metadata_path,
        )
        return
    if self.metadata["src_format"] not in LEXICON_LOADERS:
        LOGGER.error(
            "File %s references an unknown lexicon format: %s",
            metadata_path,
            self.metadata["src_format"],
        )
        return
    self.loader = LEXICON_LOADERS[self.metadata["src_format"]]
def create_epub(input_path, output_path, unpacked=False):
    if os.path.isdir(output_path):
        shutil.rmtree(output_path)
    ensure_dirs(output_path)
    input_dirname = os.path.dirname(input_path)
    if unpacked:
        os.mkdir(output_path)
        copy = copy_file_to_dir
        save = save_txt_to_dir
    else:
        copy = copy_file_to_zip
        save = save_txt_zip

    # mimetype file
    copy(output_path, MIMETYPE_ORIGIN_PATH, MIMETYPE_DEST_PATH)

    # container.xml file
    container_template = load_txt(CONTAINER_ORIGIN_PATH)
    container_txt = pystache.render(
        container_template, {"package_path": PACKAGE_DEST_PATH}
    )
    save(output_path, CONTAINER_DEST_PATH, container_txt)

    # the SMIL and all the files referenced in the SMIL
    package_data = extract_files_from_SMIL(input_path)
    package_template = load_txt(PACKAGE_ORIGIN_PATH)
    package_txt = pystache.render(package_template, package_data)
    save(output_path, PACKAGE_DEST_PATH, package_txt)
    for entry in package_data["media"]:
        origin_path = os.path.join(input_dirname, entry["origin_path"])
        if not os.path.exists(origin_path):
            LOGGER.warning("Cannot find file %s to copy into EPUB file", origin_path)
            continue
        dest_path = os.path.join(EPUB_PATH, entry["dest_path"])
        copy(output_path, origin_path, dest_path)

    # CSS file
    copy(output_path, STYLESHEET_ORIGIN_PATH, STYLESHEET_DEST_PATH)
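# Hedged usage sketch (illustrative only): packaging an aligned SMIL file and the
# media it references into an EPUB. The paths are placeholders, not values from
# the original module.
def _example_create_epub():
    create_epub("aligned/aligned.smil", "aligned/aligned.epub")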