def check_ffmpeg(cls):
    """
    Check whether ``ffmpeg`` can be called.

    The file ``tools/res/audio.mp3`` (resolved through ``gf.absolute_path``)
    is converted to a temporary WAVE file with ``FFMPEGWrapper``;
    the check succeeds if the conversion returns a true value.

    Return ``True`` on failure and ``False`` on success.

    :rtype: bool
    """
    try:
        from aeneas.ffmpegwrapper import FFMPEGWrapper
        input_file_path = gf.absolute_path(u"tools/res/audio.mp3", __file__)
        handler, output_file_path = gf.tmp_file(suffix=u".wav")
        try:
            result = FFMPEGWrapper().convert(input_file_path, output_file_path)
        finally:
            # remove the temporary file even when the conversion raises
            gf.delete_file(handler, output_file_path)
        if result:
            gf.print_success(u"ffmpeg OK")
            return False
    except Exception:
        # best-effort check: any error is reported as a failure below,
        # while KeyboardInterrupt and SystemExit are allowed to propagate
        pass
    gf.print_error(u"ffmpeg ERROR")
    gf.print_info(u" Please make sure you have ffmpeg installed correctly")
    gf.print_info(u" and that its path is in your PATH environment variable")
    return True
def check_espeak(cls):
    """
    Check whether ``espeak`` can be called.

    A fixed English sentence is synthesized to a temporary WAVE file
    with ``ESPEAKWrapper.synthesize_single``; the check succeeds
    if the call returns a true value.

    Return ``True`` on failure and ``False`` on success.

    :rtype: bool
    """
    try:
        from aeneas.espeakwrapper import ESPEAKWrapper
        text = u"From fairest creatures we desire increase,"
        language = u"eng"
        handler, output_file_path = gf.tmp_file(suffix=u".wav")
        try:
            result = ESPEAKWrapper().synthesize_single(text, language, output_file_path)
        finally:
            # remove the temporary file even when the synthesis raises
            gf.delete_file(handler, output_file_path)
        if result:
            gf.print_success(u"espeak OK")
            return False
    except Exception:
        # best-effort check: any error is reported as a failure below,
        # while KeyboardInterrupt and SystemExit are allowed to propagate
        pass
    gf.print_error(u"espeak ERROR")
    gf.print_info(u" Please make sure you have espeak installed correctly")
    gf.print_info(u" and that its path is in your PATH environment variable")
    gf.print_info(u" You might also want to check that the espeak-data directory")
    gf.print_info(u" is set up correctly, for example, it has the correct permissions")
    return True
def synthesize_multiple(self, text_file, ofp=None, quit_after=None, backwards=False, zero_length=False):
    """
    Test helper: synthesize ``text_file`` with ``FESTIVALWrapper``
    and check the total time reported by the engine.

    :param text_file: the text file to be synthesized
    :param ofp: the output file path; if ``None``, a temporary file is created
    :param quit_after: passed through to ``synthesize_multiple``
    :param backwards: passed through to ``synthesize_multiple``
    :param bool zero_length: if ``True``, expect a total time of ``0.0``,
                             otherwise expect a strictly positive total time
    """
    if ofp is None:
        handler, output_file_path = gf.tmp_file(suffix=".wav")
    else:
        handler = None
        output_file_path = ofp
    try:
        rconf = RuntimeConfiguration()
        rconf[RuntimeConfiguration.TTS] = u"festival"
        rconf[RuntimeConfiguration.TTS_PATH] = u"text2wave"
        tts_engine = FESTIVALWrapper(rconf=rconf)
        _, total_time, _ = tts_engine.synthesize_multiple(
            text_file,
            output_file_path,
            quit_after,
            backwards
        )
    finally:
        # the output file is removed whatever exception type is raised;
        # the exception itself propagates unchanged to the caller
        gf.delete_file(handler, output_file_path)
    if zero_length:
        self.assertEqual(total_time, 0.0)
    else:
        self.assertGreater(total_time, 0.0)
def inner(c_ext, cew_subprocess):
    """
    Synthesize ``text_file`` with ``ESPEAKWrapper`` under the given
    runtime configuration and check the total time reported by the engine.

    ``ofp``, ``text_file``, ``quit_after``, ``backwards``, ``zero_length``
    and ``self`` are taken from the enclosing scope.

    :param c_ext: value for ``RuntimeConfiguration.C_EXTENSIONS``
    :param cew_subprocess: value for ``RuntimeConfiguration.CEW_SUBPROCESS_ENABLED``
    """
    if ofp is None:
        handler, output_file_path = gf.tmp_file(suffix=".wav")
    else:
        handler = None
        output_file_path = ofp
    try:
        rconf = RuntimeConfiguration()
        rconf[RuntimeConfiguration.C_EXTENSIONS] = c_ext
        rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = cew_subprocess
        tts_engine = ESPEAKWrapper(rconf=rconf)
        _, total_time, _ = tts_engine.synthesize_multiple(
            text_file,
            output_file_path,
            quit_after,
            backwards
        )
    finally:
        # the output file is removed whatever exception type is raised;
        # the exception itself propagates unchanged to the caller
        gf.delete_file(handler, output_file_path)
    if zero_length:
        self.assertEqual(total_time, 0.0)
    else:
        self.assertGreater(total_time, 0.0)
def inner(c_ext, cew_subprocess, cache):
    """
    Synthesize ``text_file`` with ``self.TTS_CLASS`` under the given
    runtime configuration and check the total time reported by the engine.

    If one of ``OSError``, ``TypeError``, ``UnicodeDecodeError`` or
    ``ValueError`` is raised, the output file (and the cache, if enabled)
    is cleaned up and the exception is checked against ``expected_exc``.

    ``ofp``, ``text_file``, ``quit_after``, ``backwards``, ``zero_length``,
    ``expected_exc`` and ``self`` are taken from the enclosing scope.

    :param c_ext: value for ``RuntimeConfiguration.C_EXTENSIONS``
    :param cew_subprocess: value for ``RuntimeConfiguration.CEW_SUBPROCESS_ENABLED``
    :param cache: value for ``RuntimeConfiguration.TTS_CACHE``
    """
    if ofp is None:
        handler, output_file_path = gf.tmp_file(suffix=".wav")
    else:
        handler = None
        output_file_path = ofp
    # bound before the try block, so that the except clause can test it
    # even when building the engine itself raises
    tts_engine = None
    try:
        rconf = RuntimeConfiguration()
        rconf[RuntimeConfiguration.TTS] = self.TTS
        rconf[RuntimeConfiguration.TTS_PATH] = self.TTS_PATH
        rconf[RuntimeConfiguration.C_EXTENSIONS] = c_ext
        rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = cew_subprocess
        rconf[RuntimeConfiguration.TTS_CACHE] = cache
        tts_engine = self.TTS_CLASS(rconf=rconf)
        anchors, total_time, num_chars = tts_engine.synthesize_multiple(
            text_file,
            output_file_path,
            quit_after,
            backwards
        )
        gf.delete_file(handler, output_file_path)
        if cache:
            tts_engine.clear_cache()
        if zero_length:
            self.assertEqual(total_time, 0.0)
        else:
            self.assertGreater(total_time, 0.0)
    except (OSError, TypeError, UnicodeDecodeError, ValueError) as exc:
        gf.delete_file(handler, output_file_path)
        if (cache) and (tts_engine is not None):
            tts_engine.clear_cache()
        # re-raise inside assertRaises: passes only if the exception
        # is the one the test expects
        with self.assertRaises(expected_exc):
            raise exc
def check_espeak(cls):
    """
    Check whether ``espeak`` can be called.

    A text file with a single English fragment is synthesized to a
    temporary WAVE file with ``ESPEAKTTSWrapper.synthesize_multiple``;
    the check succeeds if no exception is raised.

    Return ``True`` on failure and ``False`` on success.

    :rtype: bool
    """
    try:
        from aeneas.textfile import TextFile
        from aeneas.textfile import TextFragment
        from aeneas.ttswrappers.espeakttswrapper import ESPEAKTTSWrapper
        text = u"From fairest creatures we desire increase,"
        text_file = TextFile()
        text_file.add_fragment(TextFragment(language=u"eng", lines=[text], filtered_lines=[text]))
        handler, output_file_path = gf.tmp_file(suffix=u".wav")
        try:
            ESPEAKTTSWrapper().synthesize_multiple(text_file, output_file_path)
        finally:
            # remove the temporary file even when the synthesis raises
            gf.delete_file(handler, output_file_path)
        gf.print_success(u"espeak OK")
        return False
    except Exception:
        # best-effort check: any error is reported as a failure below,
        # while KeyboardInterrupt and SystemExit are allowed to propagate
        pass
    gf.print_error(u"espeak ERROR")
    gf.print_info(u" Please make sure you have espeak installed correctly")
    gf.print_info(u" and that its path is in your PATH environment variable")
    gf.print_info(u" You might also want to check that the espeak-data directory")
    gf.print_info(u" is set up correctly, for example, it has the correct permissions")
    return True
def inner(c_ext, cew_subprocess, cache):
    """
    Synthesize ``text_file`` with ``self.TTS_CLASS`` under the given
    runtime configuration and check the total time reported by the engine.

    If one of ``OSError``, ``TypeError``, ``UnicodeDecodeError`` or
    ``ValueError`` is raised, the output file (and the cache, if enabled)
    is cleaned up and the exception is checked against ``expected_exc``.

    ``ofp``, ``text_file``, ``quit_after``, ``backwards``, ``zero_length``,
    ``expected_exc`` and ``self`` are taken from the enclosing scope.

    :param c_ext: value for ``RuntimeConfiguration.C_EXTENSIONS``
    :param cew_subprocess: value for ``RuntimeConfiguration.CEW_SUBPROCESS_ENABLED``
    :param cache: value for ``RuntimeConfiguration.TTS_CACHE``
    """
    if ofp is None:
        handler, output_file_path = gf.tmp_file(suffix=".wav")
    else:
        handler = None
        output_file_path = ofp
    # bound before the try block, so that the except clause can test it
    # even when building the engine itself raises
    tts_engine = None
    try:
        rconf = RuntimeConfiguration()
        rconf[RuntimeConfiguration.TTS] = self.TTS
        rconf[RuntimeConfiguration.TTS_PATH] = self.TTS_PATH
        rconf[RuntimeConfiguration.C_EXTENSIONS] = c_ext
        rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = cew_subprocess
        rconf[RuntimeConfiguration.TTS_CACHE] = cache
        tts_engine = self.TTS_CLASS(rconf=rconf)
        anchors, total_time, num_chars = tts_engine.synthesize_multiple(
            text_file,
            output_file_path,
            quit_after,
            backwards
        )
        gf.delete_file(handler, output_file_path)
        if cache:
            tts_engine.clear_cache()
        if zero_length:
            self.assertEqual(total_time, 0.0)
        else:
            self.assertGreater(total_time, 0.0)
    except (OSError, TypeError, UnicodeDecodeError, ValueError) as exc:
        gf.delete_file(handler, output_file_path)
        if (cache) and (tts_engine is not None):
            tts_engine.clear_cache()
        # re-raise inside assertRaises: passes only if the exception
        # is the one the test expects
        with self.assertRaises(expected_exc):
            raise exc
def check_espeak(cls):
    """
    Check whether ``espeak`` can be called.

    A fixed English sentence is synthesized to a temporary WAVE file
    with ``ESPEAKWrapper.synthesize_single``; the check succeeds
    if the call returns a true value.

    Return ``True`` on failure and ``False`` on success.

    :rtype: bool
    """
    try:
        from aeneas.espeakwrapper import ESPEAKWrapper
        text = u"From fairest creatures we desire increase,"
        language = u"eng"
        handler, output_file_path = gf.tmp_file(suffix=u".wav")
        try:
            espeak = ESPEAKWrapper()
            result = espeak.synthesize_single(
                text,
                language,
                output_file_path
            )
        finally:
            # remove the temporary file even when the synthesis raises
            gf.delete_file(handler, output_file_path)
        if result:
            gf.print_success(u"espeak OK")
            return False
    except Exception:
        # best-effort check: any error is reported as a failure below,
        # while KeyboardInterrupt and SystemExit are allowed to propagate
        pass
    gf.print_error(u"espeak ERROR")
    gf.print_info(u" Please make sure you have espeak installed correctly")
    gf.print_info(u" and that its path is in your PATH environment variable")
    gf.print_info(u" You might also want to check that the espeak-data directory")
    gf.print_info(u" is set up correctly, for example, it has the correct permissions")
    return True
def synthesize_multiple(self, text_file, ofp=None, quit_after=None, backwards=False, zero_length=False):
    """
    Test helper: synthesize ``text_file`` with ``FESTIVALWrapper``
    and check the total time reported by the engine.

    :param text_file: the text file to be synthesized
    :param ofp: the output file path; if ``None``, a temporary file is created
    :param quit_after: passed through to ``synthesize_multiple``
    :param backwards: passed through to ``synthesize_multiple``
    :param bool zero_length: if ``True``, expect a total time of ``0.0``,
                             otherwise expect a strictly positive total time
    """
    if ofp is None:
        handler, output_file_path = gf.tmp_file(suffix=".wav")
    else:
        handler = None
        output_file_path = ofp
    try:
        rconf = RuntimeConfiguration()
        rconf[RuntimeConfiguration.TTS] = u"festival"
        rconf[RuntimeConfiguration.TTS_PATH] = u"text2wave"
        tts_engine = FESTIVALWrapper(rconf=rconf)
        _, total_time, _ = tts_engine.synthesize_multiple(
            text_file,
            output_file_path,
            quit_after,
            backwards
        )
    finally:
        # the output file is removed whatever exception type is raised;
        # the exception itself propagates unchanged to the caller
        gf.delete_file(handler, output_file_path)
    if zero_length:
        self.assertEqual(total_time, 0.0)
    else:
        self.assertGreater(total_time, 0.0)
def test_close_file_handler(self):
    """
    Closing the handler of a temporary file must leave the file in place;
    the file disappears only after ``gf.delete_file`` is called.
    """
    tmp_handler, tmp_path = gf.tmp_file()

    # the file exists right after creation
    exists_after_creation = gf.file_exists(tmp_path)
    self.assertTrue(exists_after_creation)

    # closing the handler does not remove the file
    gf.close_file_handler(tmp_handler)
    exists_after_close = gf.file_exists(tmp_path)
    self.assertTrue(exists_after_close)

    # deleting removes it
    gf.delete_file(tmp_handler, tmp_path)
    exists_after_delete = gf.file_exists(tmp_path)
    self.assertFalse(exists_after_delete)
def test_read_file_bytes(self):
    """
    ``gf.read_file_bytes`` must return a byte string whose length
    matches the 7 ASCII characters written to the file.
    """
    handler, path = gf.tmp_file()
    try:
        with io.open(path, "w", encoding="utf-8") as tmp_file:
            tmp_file.write(u"Foo bar")
        contents = gf.read_file_bytes(path)
        self.assertTrue(gf.is_bytes(contents))
        self.assertEqual(len(contents), 7)
    finally:
        # do not leave the temporary file behind when an assertion fails
        gf.delete_file(handler, path)
def test_write(self):
    """
    Writing a loaded audio file to disk and loading the copy
    must yield identical audio samples.
    """
    audiofile = self.load(self.AUDIO_FILE_WAVE, rs=True)
    data = audiofile.audio_samples
    handler, output_file_path = gf.tmp_file(suffix=".wav")
    try:
        audiofile.write(output_file_path)
        audiocopy = self.load(output_file_path)
        datacopy = audiocopy.audio_samples
        self.assertTrue((datacopy == data).all())
    finally:
        # do not leave the temporary file behind when the test fails
        gf.delete_file(handler, output_file_path)
def _loop_use_cache(self, helper_function, num, fragment):
    """
    Synthesize a single fragment using the cache.

    If the ``(language, filtered_text)`` pair of the fragment is cached,
    the audio data is read from the cached file. Otherwise the fragment
    is synthesized by ``helper_function`` into a new temporary file,
    which is added to the cache only if its duration is positive.

    :param helper_function: the function performing the actual synthesis;
                            it is called with ``text``, ``voice_code``,
                            ``output_file_path`` and ``return_audio_data``
                            keyword arguments
    :param int num: the index of the fragment, used for logging only
    :param fragment: the text fragment to be synthesized
    :rtype: tuple ``(True, data)`` on success, ``(False, None)`` on failure
    """
    self.log([u"Examining fragment %d (cache)...", num])
    fragment_info = (fragment.language, fragment.filtered_text)
    if self.cache.is_cached(fragment_info):
        self.log(u"Fragment cached: retrieving audio data from cache")

        # read data from file, whose path is in the cache
        file_handler, file_path = self.cache.get(fragment_info)
        self.log([u"Reading cached fragment at '%s'...", file_path])
        succeeded, data = self._read_audio_data(file_path)
        if not succeeded:
            self.log_crit(u"An unexpected error occurred while reading cached audio file")
            return (False, None)
        self.log([u"Reading cached fragment at '%s'... done", file_path])
    else:
        self.log(u"Fragment not cached: synthesizing and caching")

        # creating destination file
        file_info = gf.tmp_file(
            suffix=u".cache.wav",
            root=self.rconf[RuntimeConfiguration.TMP_PATH]
        )
        file_handler, file_path = file_info
        self.log([u"Synthesizing fragment to '%s'...", file_path])

        # synthesize and get the duration of the output file
        voice_code = self._language_to_voice_code(fragment.language)
        self.log(u"Calling helper function")
        succeeded, data = helper_function(
            text=fragment.filtered_text,
            voice_code=voice_code,
            output_file_path=file_path,
            return_audio_data=True
        )

        # check output
        if not succeeded:
            self.log_crit(u"An unexpected error occurred in helper_function")
            # the file was never added to the cache and its path is not
            # returned, so nothing else could ever close or remove it
            gf.delete_file(file_handler, file_path)
            return (False, None)
        self.log([u"Synthesizing fragment to '%s'... done", file_path])

        duration, sr_nu, enc_nu, samples = data
        is_cacheable = duration > 0
        if is_cacheable:
            self.log(u"Fragment has > 0 duration, adding it to cache")
            self.cache.add(fragment_info, file_info)
            self.log(u"Added fragment to cache")
        else:
            self.log(u"Fragment has zero duration, not adding it to cache")
        self.log([u"Closing file handler for cached output file path '%s'", file_path])
        gf.close_file_handler(file_handler)
        if not is_cacheable:
            # not referenced by the cache: remove the orphaned file
            # (the handler has just been closed, hence None)
            gf.delete_file(None, file_path)
    self.log([u"Examining fragment %d (cache)... done", num])
    return (True, data)
def test_output_sync_map(self):
    """
    ``Task.output_sync_map_file`` must return the path stored in
    ``sync_map_file_path_absolute`` after writing a TXT sync map.
    """
    task = Task()
    task.configuration = TaskConfiguration()
    task.configuration["language"] = Language.ENG
    task.configuration["o_format"] = SyncMapFormat.TXT
    task.sync_map = self.dummy_sync_map()
    handler, output_file_path = gf.tmp_file(suffix=".txt")
    try:
        task.sync_map_file_path_absolute = output_file_path
        path = task.output_sync_map_file()
        self.assertIsNotNone(path)
        self.assertEqual(path, output_file_path)
    finally:
        # do not leave the temporary file behind when the test fails
        gf.delete_file(handler, output_file_path)
def test_compress_file(self):
    """
    Compress the unpacked fixture into every packed container format
    listed in ``self.FILES`` and check that re-opening the result
    yields ``self.EXPECTED_ENTRIES``.
    """
    input_path = self.FILES["unpacked"]["path"]
    for file_info in self.FILES.values():
        fmt = file_info["format"]
        if fmt == ContainerFormat.UNPACKED:
            # nothing to compress for the unpacked fixture itself
            continue
        handler, output_path = gf.tmp_file(suffix="." + fmt)
        try:
            cont = Container(output_path, fmt)
            cont.compress(input_path)
            self.assertTrue(os.path.isfile(output_path))
            copy = Container(output_path, fmt)
            self.assertEqual(copy.entries, self.EXPECTED_ENTRIES)
        finally:
            # do not leave the temporary file behind when the test fails
            gf.delete_file(handler, output_path)
def inner(c_ext, cew_subprocess):
    """
    Synthesize the plain text file at ``path`` with a ``Synthesizer``
    under the given runtime configuration, then check the number of
    anchors and, optionally, the second element of the result.

    ``path``, ``logger``, ``quit_after``, ``backwards``, ``expected``,
    ``expected2`` and ``self`` are taken from the enclosing scope.

    :param c_ext: value for ``RuntimeConfiguration.C_EXTENSIONS``
    :param cew_subprocess: value for ``RuntimeConfiguration.CEW_SUBPROCESS_ENABLED``
    """
    handler, output_file_path = gf.tmp_file(suffix=".wav")
    try:
        tfl = TextFile(gf.absolute_path(path, __file__), TextFileFormat.PLAIN)
        tfl.set_language(Language.ENG)
        synth = Synthesizer(logger=logger)
        synth.rconf[RuntimeConfiguration.C_EXTENSIONS] = c_ext
        synth.rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = cew_subprocess
        result = synth.synthesize(tfl, output_file_path, quit_after=quit_after, backwards=backwards)
    finally:
        # remove the temporary file even when the synthesis raises
        gf.delete_file(handler, output_file_path)
    self.assertEqual(len(result[0]), expected)
    if expected2 is not None:
        self.assertAlmostEqual(result[1], expected2, places=0)
def execute(self, config_string, audio_path, text_path):
    """
    Test helper: run a full task and check that a non-empty
    sync map file is written to a temporary path.

    :param config_string: the task configuration string
    :param audio_path: the audio file path, resolved through ``gf.absolute_path``
    :param text_path: the text file path, resolved through ``gf.absolute_path``
    """
    handler, tmp_path = gf.tmp_file()
    try:
        task = Task(config_string)
        task.audio_file_path_absolute = gf.absolute_path(audio_path, __file__)
        task.text_file_path_absolute = gf.absolute_path(text_path, __file__)
        executor = ExecuteTask(task)
        executor.execute()
        task.sync_map_file_path_absolute = tmp_path
        result_path = task.output_sync_map_file()
        self.assertIsNotNone(result_path)
        self.assertEqual(result_path, tmp_path)
        self.assertGreater(len(gf.read_file_bytes(result_path)), 0)
    finally:
        # do not leave the temporary file behind when the test fails
        gf.delete_file(handler, tmp_path)
def perform_run(self, audio_file_path, text_file_path, config_string, rconf_string):
    """
    Run ``ExecuteTaskCLI`` on the given inputs, writing the sync map
    to a temporary file that is removed afterwards.

    :param audio_file_path: the path of the audio file
    :param text_file_path: the path of the text file
    :param config_string: the task configuration string
    :param rconf_string: the runtime configuration string, passed as ``-r="..."``
    :returns: the ``logger`` attribute of the executor
    """
    output_file_handler, output_file_path = gf.tmp_file()
    try:
        executor = ExecuteTaskCLI(use_sys=False)
        verbose = "-v" if self.verbose else ""
        executor.run(arguments=[
            "dummy placeholder for aeneas.tools.execute_task",
            audio_file_path,
            text_file_path,
            config_string,
            output_file_path,
            "-r=\"%s\"" % rconf_string,
            verbose
        ])
    finally:
        # remove the temporary file even when the run raises
        gf.delete_file(output_file_handler, output_file_path)
    return executor.logger
def test_cew_synthesize_single(self):
    """
    Synthesize a single fragment through ``aeneas.cew.cew`` and check
    the returned sample rate and interval.

    The test passes silently if the extension cannot be imported.
    """
    handler, output_file_path = gf.tmp_file(suffix=".wav")
    try:
        import aeneas.cew.cew
        sr, begin, end = aeneas.cew.cew.synthesize_single(
            output_file_path,
            u"en",  # NOTE cew requires the actual eSpeak voice code
            u"Dummy"
        )
        self.assertEqual(sr, 22050)
        self.assertEqual(begin, 0)
        self.assertGreater(end, 0)
    except ImportError:
        pass
    finally:
        # do not leave the temporary file behind when an assertion fails
        gf.delete_file(handler, output_file_path)
def synthesize_and_clean(text, voice_code):
    """
    Synthesize a single fragment via subprocess,
    and immediately remove the temporary file.

    ``self`` is taken from the enclosing scope.

    :param text: the text to be synthesized; a trailing space is appended
    :param voice_code: the voice code passed to the subprocess synthesizer
    :returns: the ``data`` element returned by ``_synthesize_single_subprocess``
              (documented by the original author as a
              ``(duration, sample_rate, encoding, samples)`` tuple)
    """
    self.log(u"Synthesizing text...")
    handler, tmp_destination = gf.tmp_file(
        suffix=u".wav",
        root=self.rconf[RuntimeConfiguration.TMP_PATH]
    )
    try:
        _, data = self._synthesize_single_subprocess(
            text=(text + u" "),
            voice_code=voice_code,
            output_file_path=tmp_destination
        )
    finally:
        # remove the temporary file even when the synthesis raises
        self.log([u"Removing temporary file '%s'", tmp_destination])
        gf.delete_file(handler, tmp_destination)
    self.log(u"Synthesizing text... done")
    return data
def synthesize_single(self, text, language, ofp=None, zero_length=False):
    """
    Test helper: synthesize ``text`` with ``FESTIVALWrapper``
    and check the value returned by ``synthesize_single``.

    :param text: the text to be synthesized
    :param language: the language of the text
    :param ofp: the output file path; if ``None``, a temporary file is created
    :param bool zero_length: if ``True``, expect a result of ``0``,
                             otherwise expect a strictly positive result
    """
    if ofp is None:
        handler, output_file_path = gf.tmp_file(suffix=".wav")
    else:
        handler = None
        output_file_path = ofp
    try:
        rconf = RuntimeConfiguration()
        rconf[RuntimeConfiguration.TTS] = u"festival"
        rconf[RuntimeConfiguration.TTS_PATH] = u"text2wave"
        tts_engine = FESTIVALWrapper(rconf=rconf)
        result = tts_engine.synthesize_single(text, language, output_file_path)
    finally:
        # the output file is removed whatever exception type is raised;
        # the exception itself propagates unchanged to the caller
        gf.delete_file(handler, output_file_path)
    if zero_length:
        self.assertEqual(result, 0)
    else:
        self.assertGreater(result, 0)
def inner(c_ext, cew_subprocess):
    """
    Synthesize ``text`` with ``ESPEAKWrapper`` under the given runtime
    configuration and check the value returned by ``synthesize_single``.

    ``ofp``, ``text``, ``language``, ``zero_length`` and ``self``
    are taken from the enclosing scope.

    :param c_ext: value for ``RuntimeConfiguration.C_EXTENSIONS``
    :param cew_subprocess: value for ``RuntimeConfiguration.CEW_SUBPROCESS_ENABLED``
    """
    if ofp is None:
        handler, output_file_path = gf.tmp_file(suffix=".wav")
    else:
        handler = None
        output_file_path = ofp
    try:
        rconf = RuntimeConfiguration()
        rconf[RuntimeConfiguration.C_EXTENSIONS] = c_ext
        rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = cew_subprocess
        tts_engine = ESPEAKWrapper(rconf=rconf)
        result = tts_engine.synthesize_single(text, language, output_file_path)
    finally:
        # the output file is removed whatever exception type is raised;
        # the exception itself propagates unchanged to the caller
        gf.delete_file(handler, output_file_path)
    if zero_length:
        self.assertEqual(result, 0)
    else:
        self.assertGreater(result, 0)
def _synthesize(self, text_file):
    """
    Synthesize text into a WAVE file.

    Return a tuple consisting of:

    1. the handler of the generated audio file
    2. the path of the generated audio file
    3. the first element of the synthesizer result, documented as
       the list of anchors, that is, a list of floats
       each representing the start time of the corresponding
       text fragment in the generated wave file
       ``[start_1, start_2, ..., start_n]``
    4. the ``output_audio_format`` attribute of the synthesizer

    The caller is responsible for deleting the generated file.
    If the synthesis raises, the temporary file is removed here,
    since the caller never receives its handler and path.

    :param text_file: the text to be synthesized
    :type  text_file: :class:`~aeneas.textfile.TextFile`
    :rtype: tuple (handler, string, list, audio format)
    """
    handler, path = gf.tmp_file(
        suffix=u".wav",
        root=self.rconf[RuntimeConfiguration.TMP_PATH]
    )
    try:
        result = self.synthesizer.synthesize(text_file, path)
    except Exception:
        gf.delete_file(handler, path)
        raise
    return (handler, path, result[0], self.synthesizer.output_audio_format)
def _compose_output_file_path(self, extension, output_file_path=None):
    """
    Determine the path of the output file.

    When ``output_file_path`` is given, it is checked for writability
    and returned. When it is ``None``, a temporary file with the given
    extension is created and immediately deleted, and its path is returned.

    :param string extension: the extension (without dot) of the temporary file
    :param string output_file_path: the requested output path, or ``None``
    :rtype: string
    """
    self.log(u"Determining output file path...")
    if output_file_path is not None:
        self.log(u"output_file_path is not None: cheking that file can be written")
        writable = gf.file_can_be_written(output_file_path)
        if not writable:
            self.log_exc(
                u"Path '%s' cannot be written. Wrong permissions?" % (output_file_path),
                None,
                True,
                OSError
            )
    else:
        self.log(u"output_file_path is None: creating temp file")
        tmp_root = self.rconf[RuntimeConfiguration.TMP_PATH]
        tmp_suffix = ".%s" % extension
        tmp_handler, output_file_path = gf.tmp_file(root=tmp_root, suffix=tmp_suffix)
        # only the path is needed: the file itself is removed right away
        gf.delete_file(tmp_handler, output_file_path)
    self.log(u"Determining output file path... done")
    self.log([u"Output file path is '%s'", output_file_path])
    return output_file_path
def test_cew_synthesize_multiple_lang(self):
    """
    Synthesize three fragments in two languages through
    ``aeneas.cew.cew`` and check the returned sample rate,
    number of synthesized fragments and number of intervals.

    The test passes silently if the extension cannot be imported.
    """
    handler, output_file_path = gf.tmp_file(suffix=".wav")
    try:
        c_quit_after = 0.0
        c_backwards = 0
        c_text = [
            (u"en", u"Dummy 1"),       # NOTE cew requires the actual eSpeak voice code
            (u"it", u"Segnaposto 2"),  # NOTE cew requires the actual eSpeak voice code
            (u"en", u"Dummy 3"),       # NOTE cew requires the actual eSpeak voice code
        ]
        import aeneas.cew.cew
        sr, sf, intervals = aeneas.cew.cew.synthesize_multiple(
            output_file_path,
            c_quit_after,
            c_backwards,
            c_text
        )
        self.assertEqual(sr, 22050)
        self.assertEqual(sf, 3)
        self.assertEqual(len(intervals), 3)
    except ImportError:
        pass
    finally:
        # do not leave the temporary file behind when an assertion fails
        gf.delete_file(handler, output_file_path)
def test_cew_synthesize_multiple_lang(self):
    """
    Synthesize three fragments in two languages through
    ``aeneas.cew.cew`` and check the returned sample rate,
    number of synthesized fragments and number of intervals.

    The test passes silently if the extension cannot be imported.
    """
    handler, output_file_path = gf.tmp_file(suffix=".wav")
    try:
        c_quit_after = 0.0
        c_backwards = 0
        c_text = [
            (u"en", u"Dummy 1"),       # NOTE cew requires the actual eSpeak voice code
            (u"it", u"Segnaposto 2"),  # NOTE cew requires the actual eSpeak voice code
            (u"en", u"Dummy 3"),       # NOTE cew requires the actual eSpeak voice code
        ]
        import aeneas.cew.cew
        sr, sf, intervals = aeneas.cew.cew.synthesize_multiple(
            output_file_path,
            c_quit_after,
            c_backwards,
            c_text
        )
        self.assertEqual(sr, 22050)
        self.assertEqual(sf, 3)
        self.assertEqual(len(intervals), 3)
    except ImportError:
        pass
    finally:
        # do not leave the temporary file behind when an assertion fails
        gf.delete_file(handler, output_file_path)
def _synthesize(self, text_file):
    """
    Synthesize text into a WAVE file.

    Return:

    1. handler of the generated wave file
    2. path of the generated wave file
    3. the first element of the synthesizer result, documented as
       the list of anchors, that is, a list of floats
       each representing the start time of the corresponding
       text fragment in the generated wave file
       ``[start_1, start_2, ..., start_n]``
    4. the ``output_is_mono_wave`` attribute of the synthesizer
       (if the synthesizer produced a PCM16 mono WAVE file)

    The caller is responsible for deleting the generated file.
    If the synthesis raises, the temporary file is removed here,
    since the caller never receives its handler and path.

    :param text_file: the text to be synthesized
    :rtype: tuple (handler, string, list, bool)
    """
    synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
    handler, path = gf.tmp_file(
        suffix=u".wav",
        root=self.rconf[RuntimeConfiguration.TMP_PATH]
    )
    try:
        result = synthesizer.synthesize(text_file, path)
    except Exception:
        gf.delete_file(handler, path)
        raise
    anchors = result[0]
    return (handler, path, anchors, synthesizer.output_is_mono_wave)
def _synthesize(self, text_file):
    """
    Synthesize text into a WAVE file.

    Return:

    1. handler of the generated wave file
    2. path of the generated wave file
    3. the first element of the synthesizer result, documented as
       the list of anchors, that is, a list of floats
       each representing the start time of the corresponding
       text fragment in the generated wave file
       ``[start_1, start_2, ..., start_n]``
    4. the ``output_is_mono_wave`` attribute of the synthesizer
       (if the synthesizer produced a PCM16 mono WAVE file)

    The caller is responsible for deleting the generated file.
    If the synthesis raises, the temporary file is removed here,
    since the caller never receives its handler and path.

    :param text_file: the text to be synthesized
    :rtype: tuple (handler, string, list, bool)
    """
    synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
    handler, path = gf.tmp_file(
        suffix=u".wav",
        root=self.rconf[RuntimeConfiguration.TMP_PATH]
    )
    try:
        result = synthesizer.synthesize(text_file, path)
    except Exception:
        gf.delete_file(handler, path)
        raise
    anchors = result[0]
    return (handler, path, anchors, synthesizer.output_is_mono_wave)
def check_ffmpeg(cls):
    """
    Check whether ``ffmpeg`` can be called.

    The file ``tools/res/audio.mp3`` (resolved through ``gf.absolute_path``)
    is converted to a temporary WAVE file with ``FFMPEGWrapper``;
    the check succeeds if the conversion returns a true value.

    Return ``True`` on failure and ``False`` on success.

    :rtype: bool
    """
    try:
        from aeneas.ffmpegwrapper import FFMPEGWrapper
        input_file_path = gf.absolute_path(u"tools/res/audio.mp3", __file__)
        handler, output_file_path = gf.tmp_file(suffix=u".wav")
        try:
            converter = FFMPEGWrapper()
            result = converter.convert(input_file_path, output_file_path)
        finally:
            # remove the temporary file even when the conversion raises
            gf.delete_file(handler, output_file_path)
        if result:
            gf.print_success(u"ffmpeg OK")
            return False
    except Exception:
        # best-effort check: any error is reported as a failure below,
        # while KeyboardInterrupt and SystemExit are allowed to propagate
        pass
    gf.print_error(u"ffmpeg ERROR")
    gf.print_info(u" Please make sure you have ffmpeg installed correctly")
    gf.print_info(u" and that its path is in your PATH environment variable")
    return True
def _compose_output_file_path(self, extension, output_file_path=None):
    """
    Determine the path of the output file.

    When ``output_file_path`` is given, it is checked for writability
    and returned. When it is ``None``, a temporary file with the given
    extension is created and immediately deleted, and its path is returned.

    :param string extension: the extension (without dot) of the temporary file
    :param string output_file_path: the requested output path, or ``None``
    :rtype: string
    """
    self.log(u"Determining output file path...")
    if output_file_path is not None:
        self.log(u"output_file_path is not None: cheking that file can be written")
        writable = gf.file_can_be_written(output_file_path)
        if not writable:
            self.log_exc(
                u"Path '%s' cannot be written. Wrong permissions?" % (output_file_path),
                None,
                True,
                OSError
            )
    else:
        self.log(u"output_file_path is None: creating temp file")
        tmp_root = self.rconf[RuntimeConfiguration.TMP_PATH]
        tmp_suffix = ".%s" % extension
        tmp_handler, output_file_path = gf.tmp_file(root=tmp_root, suffix=tmp_suffix)
        # only the path is needed: the file itself is removed right away
        gf.delete_file(tmp_handler, output_file_path)
    self.log(u"Determining output file path... done")
    self.log([u"Output file path is '%s'", output_file_path])
    return output_file_path
def synthesize_multiple(self, audio_file_path, c_quit_after, c_backwards, u_text):
    """
    Synthesize the text contained in the given fragment list
    into a ``wav`` file.

    The fragments are written to a temporary text file, the
    ``aeneas.cewsubprocess`` module is run in a separate process,
    and its results are read back from a temporary data file.
    Both temporary files are always removed, even on failure.

    :param string audio_file_path: the path to the output audio file
    :param float c_quit_after: stop synthesizing as soon as reaching this many seconds
    :param bool c_backwards: synthesizing from the end of the text file
    :param object u_text: a list of ``(voice_code, text)`` tuples
    :rtype: tuple ``(sample_rate, synthesized, intervals)``
    """
    self.log([u"Audio file path: '%s'", audio_file_path])
    self.log([u"c_quit_after: '%.3f'", c_quit_after])
    self.log([u"c_backwards: '%d'", c_backwards])

    text_file_handler, text_file_path = gf.tmp_file()
    data_file_handler, data_file_path = gf.tmp_file()
    try:
        self.log([u"Temporary text file path: '%s'", text_file_path])
        self.log([u"Temporary data file path: '%s'", data_file_path])

        self.log(u"Populating the text file...")
        with io.open(text_file_path, "w", encoding="utf-8") as tmp_text_file:
            for f_voice_code, f_text in u_text:
                tmp_text_file.write(u"%s %s\n" % (f_voice_code, f_text))
        self.log(u"Populating the text file... done")

        arguments = [
            self.rconf[RuntimeConfiguration.CEW_SUBPROCESS_PATH],
            "-m",
            "aeneas.cewsubprocess",
            "%.3f" % c_quit_after,
            "%d" % c_backwards,
            text_file_path,
            audio_file_path,
            data_file_path,
        ]
        self.log([u"Calling with arguments '%s'", u" ".join(arguments)])
        proc = subprocess.Popen(
            arguments,
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True
        )
        # wait for the subprocess to terminate; its output is discarded
        proc.communicate()

        self.log(u"Reading output data...")
        with io.open(data_file_path, "r", encoding="utf-8") as data_file:
            lines = data_file.read().splitlines()
            # first line: sample rate; second line: synthesized count;
            # remaining lines: "begin end" pairs, one interval per line
            sr = int(lines[0])
            sf = int(lines[1])
            intervals = []
            for line in lines[2:]:
                values = line.split(u" ")
                if len(values) == 2:
                    intervals.append((TimeValue(values[0]), TimeValue(values[1])))
        self.log(u"Reading output data... done")
    finally:
        # clean up even if the subprocess or the parsing of its output fails
        self.log(u"Deleting text and data files...")
        gf.delete_file(text_file_handler, text_file_path)
        gf.delete_file(data_file_handler, data_file_path)
        self.log(u"Deleting text and data files... done")

    return (sr, sf, intervals)
def _detect(self, min_length, max_length, tail=False):
    """
    Detect the head or tail within ``min_length`` and ``max_length`` duration.

    If detecting the tail, the real wave MFCC and the query are reversed
    so that the tail detection problem reduces to a head detection problem.

    Return the duration of the head or tail, in seconds.

    :param min_length: estimated minimum length
    :type  min_length: :class:`~aeneas.exacttiming.TimeValue`
    :param max_length: estimated maximum length
    :type  max_length: :class:`~aeneas.exacttiming.TimeValue`
    :param bool tail: if ``True``, detect the tail instead of the head
    :rtype: :class:`~aeneas.exacttiming.TimeValue`
    :raises: TypeError: if one of the parameters is not ``None`` or a number
    :raises: ValueError: if one of the parameters is negative
    """
    def _sanitize(value, default, name):
        """ Replace ``None`` with the default and validate the value. """
        if value is None:
            value = default
        try:
            value = TimeValue(value)
        except (TypeError, ValueError, InvalidOperation) as exc:
            self.log_exc(u"The value of %s is not a number" % (name), exc, True, TypeError)
        if value < 0:
            self.log_exc(u"The value of %s is negative" % (name), None, True, ValueError)
        return value

    min_length = _sanitize(min_length, self.MIN_LENGTH, "min_length")
    max_length = _sanitize(max_length, self.MAX_LENGTH, "max_length")
    mws = self.rconf.mws
    min_length_frames = int(min_length / mws)
    max_length_frames = int(max_length / mws)
    self.log([u"MFCC window shift s: %.3f", mws])
    self.log([u"Min start length s: %.3f", min_length])
    self.log([u"Min start length frames: %d", min_length_frames])
    self.log([u"Max start length s: %.3f", max_length])
    self.log([u"Max start length frames: %d", max_length_frames])
    self.log([u"Tail?: %s", str(tail)])

    self.log(u"Synthesizing query...")
    synt_duration = max_length * self.QUERY_FACTOR
    self.log([u"Synthesizing at least %.3f seconds", synt_duration])
    tmp_handler, tmp_file_path = gf.tmp_file(
        suffix=u".wav",
        root=self.rconf[RuntimeConfiguration.TMP_PATH]
    )
    try:
        synt = Synthesizer(rconf=self.rconf, logger=self.logger)
        anchors, total_time, synthesized_chars = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=tail
        )
        self.log(u"Synthesizing query... done")

        self.log(u"Extracting MFCCs for query...")
        query_mfcc = AudioFileMFCC(tmp_file_path, rconf=self.rconf, logger=self.logger)
        self.log(u"Extracting MFCCs for query... done")
    finally:
        # the synthesized file is only needed to extract the query MFCCs:
        # remove it even if the synthesis or the extraction raises
        self.log(u"Cleaning up...")
        gf.delete_file(tmp_handler, tmp_file_path)
        self.log(u"Cleaning up... done")

    search_window = max_length * self.AUDIO_FACTOR
    search_window_end = min(int(search_window / mws), self.real_wave_mfcc.all_length)
    self.log([u"Query MFCC length (frames): %d", query_mfcc.all_length])
    self.log([u"Real MFCC length (frames): %d", self.real_wave_mfcc.all_length])
    self.log([u"Search window end (s): %.3f", search_window])
    self.log([u"Search window end (frames): %d", search_window_end])

    if tail:
        self.log(u"Tail => reversing real_wave_mfcc and query_mfcc")
        self.real_wave_mfcc.reverse()
        query_mfcc.reverse()

    # NOTE: VAD will be run here, if not done before
    speech_intervals = self.real_wave_mfcc.intervals(speech=True, time=False)
    if len(speech_intervals) < 1:
        self.log(u"No speech intervals, hence no start found")
        if tail:
            # undo the reversal before returning
            self.real_wave_mfcc.reverse()
        return TimeValue("0.000")

    # generate a list of begin indices
    search_end = None
    candidates_begin = []
    for interval in speech_intervals:
        if (interval[0] >= min_length_frames) and (interval[0] <= max_length_frames):
            candidates_begin.append(interval[0])
        search_end = interval[1]
        if search_end >= search_window_end:
            break

    # for each begin index, compute the acm cost
    # to match the query
    # note that we take the min over the last column of the acm
    # meaning that we allow to match the entire query wave
    # against a portion of the real wave
    candidates = []
    for candidate_begin in candidates_begin:
        self.log([u"Candidate interval starting at %d == %.3f", candidate_begin, candidate_begin * mws])
        try:
            rwm = AudioFileMFCC(
                mfcc_matrix=self.real_wave_mfcc.all_mfcc[:, candidate_begin:search_end],
                rconf=self.rconf,
                logger=self.logger
            )
            dtw = DTWAligner(
                real_wave_mfcc=rwm,
                synt_wave_mfcc=query_mfcc,
                rconf=self.rconf,
                logger=self.logger
            )
            acm = dtw.compute_accumulated_cost_matrix()
            last_column = acm[:, -1]
            min_value = numpy.min(last_column)
            min_index = numpy.argmin(last_column)
            self.log([u"Candidate interval: %d %d == %.3f %.3f", candidate_begin, search_end, candidate_begin * mws, search_end * mws])
            self.log([u" Min value: %.6f", min_value])
            self.log([u" Min index: %d == %.3f", min_index, min_index * mws])
            candidates.append((min_value, candidate_begin, min_index))
        except Exception as exc:
            # a failing candidate is logged and skipped, not fatal
            self.log_exc(u"An unexpected error occurred while running _detect", exc, False, None)

    # reverse again the real wave
    if tail:
        self.log(u"Tail => reversing real_wave_mfcc again")
        self.real_wave_mfcc.reverse()

    # return
    if len(candidates) < 1:
        self.log(u"No candidates found")
        return TimeValue("0.000")
    self.log(u"Candidates:")
    for candidate in candidates:
        self.log([u" Value: %.6f Begin Time: %.3f Min Index: %d", candidate[0], candidate[1] * mws, candidate[2]])
    # tuples sort by cost first: take the begin index of the cheapest one
    best = sorted(candidates)[0][1]
    self.log([u"Best candidate: %d == %.3f", best, best * mws])
    return best * mws
def _time_and_combine(self, text_file):
    """
    Combine original audio clips into a single WAV file.

    The clips are concatenated by calling ``ffmpeg`` with its ``concat``
    input format on the file at ``text_file.file_path``.

    Return a tuple consisting of:

    1. the handler of the generated audio file
    2. the path of the generated audio file
    3. the list of anchors: one ``[start_time, fragment_identifier,
       audio_path]`` list per text fragment, where ``start_time`` is
       the sum of the audio lengths of the preceding clips
    4. a tuple describing the format of the audio file

    :param text_file: the text with audio clips to be timed/combined
    :type  text_file: :class:`~aeneas.textfile.TextFile`
    :rtype: tuple (handler, string, list, tuple)
    """
    import subprocess

    # Concatenate all clips into a single, temporary file
    handler, path = gf.tmp_file(
        suffix=u".wav",
        root=self.rconf[RuntimeConfiguration.TMP_PATH]
    )
    # argument list instead of a shell command string, so that paths
    # containing spaces or shell metacharacters are passed verbatim;
    # as before, the exit status of ffmpeg is not checked
    subprocess.call([
        "ffmpeg",
        "-y",
        "-f", "concat",
        "-i", "{}".format(text_file.file_path),
        "-c", "copy",
        "{}".format(path),
    ])
    # NOTE(review): 'pcm_s161e' looks like a typo for 'pcm_s16le',
    # and the last element (2) is unexplained -- confirm before changing
    audio_format = ('pcm_s161e', 1, 2)

    # Build "synt" anchor times
    anchor_time, anchors = TimeValue('0.0'), []
    for fragment in text_file.fragments:
        # the clip file name is the text between the first pair of
        # single quotes of the fragment text
        # NOTE(review): the 'output/sample/' directory is hard-coded
        audio_path = 'output/sample/{}'.format(fragment.text.split("'")[1])
        audio_file = AudioFileMFCC(file_path=audio_path, file_format=audio_format)
        # TODO: Investigate faster ways to get the audio_length,
        # for example reading the duration with ffprobe
        anchors.append([anchor_time, fragment.identifier, audio_path])
        anchor_time += audio_file.audio_length
    return (handler, path, anchors, audio_format)
def test_file_size_nonzero(self):
    """
    ``gf.file_size`` must report 7 for a file
    containing 7 ASCII characters.
    """
    handler, path = gf.tmp_file()
    try:
        with io.open(path, "w", encoding="utf-8") as tmp_file:
            tmp_file.write(u"Foo bar")
        self.assertEqual(gf.file_size(path), 7)
    finally:
        # do not leave the temporary file behind when the assertion fails
        gf.delete_file(handler, path)
def test_file_size_zero(self):
    """
    Check that ``gf.file_size`` returns 0
    for a temporary file just created by ``gf.tmp_file``.
    """
    handler, path = gf.tmp_file()
    try:
        self.assertEqual(gf.file_size(path), 0)
    finally:
        # remove the temporary file even if the assertion fails
        gf.delete_file(handler, path)
def audio_from_youtube(self, source_url, download=True, output_file_path=None, preferred_index=None, largest_audio=True, preferred_format=None):
    """
    Download an audio stream from a YouTube video,
    and save it to file.

    If ``download`` is ``False``, return the list
    of available audiostreams but do not download.

    Otherwise, download the audio stream best matching
    the provided parameters, as follows.
    If ``preferred_index`` is not ``None`` and it is a valid index,
    download the audio stream at that index;
    if it is not a valid index, it is ignored (a warning is logged).
    If ``largest_audio`` is ``True``,
    download the largest audiostream;
    otherwise, download the smallest audiostream.
    If ``preferred_format`` is not ``None``,
    download the audiostream having that format;
    if no audiostream has that format, all the audiostreams are considered.
    The latter option works in combination with ``largest_audio``.

    Return the path of the downloaded file.

    :param string source_url: the URL of the YouTube video
    :param bool download: if ``True``, download the audio stream
                          best matching ``preferred_index`` or ``preferred_format``
                          and ``largest_audio``;
                          if ``False``, return the list of available audio streams
    :param string output_file_path: the path where the downloaded audio should be saved;
                                    if ``None``, create a temporary file
    :param int preferred_index: preferably download this audio stream
    :param bool largest_audio: if ``True``, download the largest audio stream available;
                               if ``False``, download the smallest one.
    :param string preferred_format: preferably download this audio format
    :rtype: string or list of pafy audio streams
    :raises: ImportError: if ``pafy`` is not installed
    :raises: OSError: if ``output_file_path`` cannot be written
    :raises: ValueError: if ``source_url`` is not a valid YouTube URL
    """

    def select_audiostream(audiostreams):
        """ Select the audiostream best matching the given parameters. """
        if preferred_index is not None:
            if preferred_index in range(len(audiostreams)):
                self.log([u"Selecting audiostream with index %d", preferred_index])
                return audiostreams[preferred_index]
            else:
                self.log_warn([u"Audio stream index '%d' not allowed", preferred_index])
                self.log_warn(u"Ignoring the requested audio stream index")
        # selecting by preferred format
        streams = audiostreams
        if preferred_format is not None:
            self.log([u"Selecting audiostreams by preferred format %s", preferred_format])
            streams = [
                audiostream for audiostream in streams
                if audiostream.extension == preferred_format
            ]
            if len(streams) < 1:
                self.log([u"No audiostream with preferred format %s", preferred_format])
                streams = audiostreams
        # sort by size, comparing the sizes only:
        # sorting (size, stream) tuples would fall back to comparing
        # the stream objects when two streams have the same size,
        # which raises a TypeError on Python 3
        streams = sorted(streams, key=lambda audio: audio.get_filesize())
        if largest_audio:
            self.log(u"Selecting largest audiostream")
            selected = streams[-1]
        else:
            self.log(u"Selecting smallest audiostream")
            selected = streams[0]
        return selected

    try:
        import pafy
    except ImportError as exc:
        self.log_exc(u"Python module pafy is not installed", exc, True, ImportError)

    try:
        video = pafy.new(source_url)
    except (IOError, OSError, ValueError) as exc:
        self.log_exc(
            u"The specified source URL '%s' is not a valid YouTube URL or you are offline" % (source_url),
            exc, True, ValueError)

    if not download:
        self.log(u"Returning the list of audio streams")
        return video.audiostreams

    output_path = output_file_path
    if output_file_path is None:
        self.log(u"output_path is None: creating temp file")
        handler, output_path = gf.tmp_file(
            root=self.rconf[RuntimeConfiguration.TMP_PATH])
    else:
        if not gf.file_can_be_written(output_path):
            self.log_exc(
                u"Path '%s' cannot be written. Wrong permissions?" % (output_path),
                None, True, OSError)

    audiostream = select_audiostream(video.audiostreams)
    if output_file_path is None:
        # the temporary file was only needed to obtain a unique path:
        # delete it and download to the same path plus the stream extension
        gf.delete_file(handler, output_path)
        output_path += "." + audiostream.extension

    self.log([u"output_path is '%s'", output_path])
    self.log(u"Downloading...")
    audiostream.download(filepath=output_path, quiet=True)
    self.log(u"Downloading... done")
    return output_path
def test_output_html_for_tuning(self):
    """
    Check that ``output_html_for_tuning`` runs without raising
    on a sync map read from a multiline, UTF-8 XML file,
    writing to a temporary ``.html`` file.
    """
    syn = self.read(SyncMapFormat.XML, multiline=True, utf8=True)
    handler, output_file_path = gf.tmp_file(suffix=".html")
    audio_file_path = "foo.mp3"
    try:
        syn.output_html_for_tuning(audio_file_path, output_file_path, None)
    finally:
        # remove the temporary file even if the call above raises
        gf.delete_file(handler, output_file_path)
def run(self, arguments, show_help=True):
    """
    Program entry point.

    Please note that the first item in ``arguments`` is discarded,
    as it is assumed to be the script/invocation name;
    pass a "dumb" placeholder if you call this method with
    an argument different from ``sys.argv``.

    The following "special" switches are consumed here
    and removed from the actual arguments:
    ``-v``/``--verbose``, ``-vv``/``--very-verbose``,
    ``-r=...``/``--runtime-configuration=...`` and
    ``-l[=path]``/``--log[=path]``.

    :param arguments: the list of arguments
    :type  arguments: list
    :param show_help: if ``False``, do not show help on ``-h`` and ``--help``
    :type  show_help: bool
    :rtype: int
    """
    # convert arguments into Unicode strings
    if self.use_sys:
        # check that sys.stdin.encoding and sys.stdout.encoding are set to utf-8
        if not gf.FROZEN:
            # compare case-insensitively: the encoding name might be reported
            # in lowercase (e.g., "utf-8"), and it might be None
            utf8_names = [u"UTF-8", u"UTF8"]
            if (sys.stdin.encoding or u"").upper() not in utf8_names:
                self.print_warning(u"The default input encoding is not UTF-8.")
                self.print_warning(u"You might want to set 'PYTHONIOENCODING=UTF-8' in your shell.")
            if (sys.stdout.encoding or u"").upper() not in utf8_names:
                self.print_warning(u"The default output encoding is not UTF-8.")
                self.print_warning(u"You might want to set 'PYTHONIOENCODING=UTF-8' in your shell.")
        # decode using sys.stdin.encoding
        args = [gf.safe_unicode_stdin(arg) for arg in arguments]
    else:
        # decode using utf-8 (but you should pass Unicode strings as parameters anyway)
        args = [gf.safe_unicode(arg) for arg in arguments]

    if show_help:
        if u"-h" in args:
            return self.print_help(short=True)

        if u"--help" in args:
            return self.print_help(short=False)

        if u"--version" in args:
            return self.print_name_version()

    # store formal arguments
    self.formal_arguments_raw = arguments
    self.formal_arguments = args

    # to obtain the actual arguments,
    # remove the first one and "special" switches
    args = args[1:]
    set_args = set(args)

    # set verbosity, if requested
    for flag in set([u"-v", u"--verbose"]) & set_args:
        self.verbose = True
        args.remove(flag)
    for flag in set([u"-vv", u"--very-verbose"]) & set_args:
        self.verbose = True
        self.very_verbose = True
        args.remove(flag)

    # set RuntimeConfiguration string, if specified
    for flag in [u"-r", u"--runtime-configuration"]:
        rconf_string = self.has_option_with_value(flag, actual_arguments=False)
        if rconf_string is not None:
            self.rconf = RuntimeConfiguration(rconf_string)
            args.remove("%s=%s" % (flag, rconf_string))

    # set log file path, if requested
    log_path = None
    for flag in [u"-l", u"--log"]:
        log_path = self.has_option_with_value(flag, actual_arguments=False)
        if log_path is not None:
            # "-l=path" or "--log=path": log to the given path
            args.remove("%s=%s" % (flag, log_path))
        elif flag in set_args:
            # bare "-l" or "--log": log to a temporary file
            # NOTE(review): the handler returned by gf.tmp_file
            # is not released here
            handler, log_path = gf.tmp_file(
                suffix=u".log",
                root=self.rconf[RuntimeConfiguration.TMP_PATH])
            args.remove(flag)
        # store the path inside the loop, so that a value found for "-l"
        # is not lost when "--log" is examined afterwards
        if log_path is not None:
            self.log_file_path = log_path

    # if no actual arguments left, print help
    if (len(args) < 1) and (show_help):
        return self.print_help(short=True)

    # store actual arguments
    self.actual_arguments = args

    # create logger
    self.logger = Logger(tee=self.verbose, tee_show_datetime=self.very_verbose)
    self.log([u"Formal arguments: %s", self.formal_arguments])
    self.log([u"Actual arguments: %s", self.actual_arguments])
    self.log([u"Runtime configuration: '%s'", self.rconf.config_string()])

    # perform command
    exit_code = self.perform_command()
    self.log([u"Execution completed with code %d", exit_code])

    # output log if requested
    if self.log_file_path is not None:
        self.log([u"User requested saving log to file '%s'", self.log_file_path])
        self.logger.write(self.log_file_path)
        if self.use_sys:
            self.print_info(u"Log written to file '%s'" % self.log_file_path)

    return self.exit(exit_code)
def _synthesize_single_subprocess_helper(self, text, voice_code, output_file_path=None, return_audio_data=True):
    """
    This is an helper function to synthesize a single text fragment via ``subprocess``.

    If ``output_file_path`` is ``None``,
    the audio data will not persist to file at the end of the method.

    If ``return_audio_data`` is ``True``,
    return the audio data at the end of the function call;
    if ``False``, just return ``(True, None)`` in case of success.

    In case of failure, ``(False, None)`` is returned
    and the temporary files created by this method are deleted.

    :param string text: the text to be synthesized
    :param voice_code: the voice code passed to the TTS engine
    :param string output_file_path: the path of the output audio file,
                                    or ``None`` to use a temporary file
    :param bool return_audio_data: if ``True``, read and return the audio data
    :rtype: tuple (result, (duration, sample_rate, codec, data)) or (result, None)
    """
    # return zero if text is the empty string
    if len(text) == 0:
        #
        # NOTE sample_rate, codec, data do not matter
        #      if the duration is 0.000 => set them to None
        #
        self.log(u"len(text) is zero: returning 0.000")
        return (True, (TimeValue("0.000"), None, None, None))

    # create a temporary output file if needed
    synt_tmp_file = (output_file_path is None)
    if synt_tmp_file:
        self.log(u"Synthesizer helper called with output_file_path=None => creating temporary output file")
        output_file_handler, output_file_path = gf.tmp_file(
            suffix=u".wav",
            root=self.rconf[RuntimeConfiguration.TMP_PATH])
        self.log([u"Temporary output file path is '%s'", output_file_path])

    # set here, so that the except branch below can always inspect them
    tmp_text_file_handler = None
    tmp_text_file_path = None
    try:
        # if the TTS engine reads text from file,
        # write the text into a temporary file
        if self.CLI_PARAMETER_TEXT_PATH in self.subprocess_arguments:
            self.log(u"TTS engine reads text from file")
            tmp_text_file_handler, tmp_text_file_path = gf.tmp_file(
                suffix=u".txt",
                root=self.rconf[RuntimeConfiguration.TMP_PATH])
            self.log([u"Creating temporary text file '%s'...", tmp_text_file_path])
            with io.open(tmp_text_file_path, "w", encoding="utf-8") as tmp_text_file:
                tmp_text_file.write(text)
            self.log([u"Creating temporary text file '%s'... done", tmp_text_file_path])
        else:
            self.log(u"TTS engine reads text from stdin")

        # copy all relevant arguments
        self.log(u"Creating arguments list...")
        arguments = []
        for arg in self.subprocess_arguments:
            if arg == self.CLI_PARAMETER_VOICE_CODE_FUNCTION:
                arguments.extend(self._voice_code_to_subprocess(voice_code))
            elif arg == self.CLI_PARAMETER_VOICE_CODE_STRING:
                arguments.append(voice_code)
            elif arg == self.CLI_PARAMETER_TEXT_PATH:
                arguments.append(tmp_text_file_path)
            elif arg == self.CLI_PARAMETER_WAVE_PATH:
                arguments.append(output_file_path)
            elif arg == self.CLI_PARAMETER_TEXT_STDIN:
                # placeholder, do not append
                pass
            elif arg == self.CLI_PARAMETER_WAVE_STDOUT:
                # placeholder, do not append
                pass
            else:
                arguments.append(arg)
        self.log(u"Creating arguments list... done")

        # actual call via subprocess
        self.log(u"Calling TTS engine...")
        self.log([u"Calling with arguments '%s'", arguments])
        self.log([u"Calling with text '%s'", text])
        proc = subprocess.Popen(
            arguments,
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True)
        if self.CLI_PARAMETER_TEXT_STDIN in self.subprocess_arguments:
            self.log(u"Passing text via stdin...")
            if gf.PY2:
                (stdoutdata, stderrdata) = proc.communicate(input=gf.safe_bytes(text))
            else:
                (stdoutdata, stderrdata) = proc.communicate(input=text)
            self.log(u"Passing text via stdin... done")
        else:
            self.log(u"Passing text via file...")
            (stdoutdata, stderrdata) = proc.communicate()
            self.log(u"Passing text via file... done")
        proc.stdout.close()
        proc.stdin.close()
        proc.stderr.close()

        if self.CLI_PARAMETER_WAVE_STDOUT in self.subprocess_arguments:
            self.log(u"TTS engine wrote audio data to stdout")
            self.log([u"Writing audio data to file '%s'...", output_file_path])
            # NOTE(review): with universal_newlines=True, on Python 3
            # stdoutdata is a text string, which cannot be written
            # to a file opened in binary mode -- confirm this path works
            with io.open(output_file_path, "wb") as output_file:
                output_file.write(stdoutdata)
            self.log([u"Writing audio data to file '%s'... done", output_file_path])
        else:
            self.log(u"TTS engine wrote audio data to file")

        if tmp_text_file_path is not None:
            self.log([u"Delete temporary text file '%s'", tmp_text_file_path])
            gf.delete_file(tmp_text_file_handler, tmp_text_file_path)
            # mark it as deleted, so that it is not deleted twice
            tmp_text_file_handler = None
            tmp_text_file_path = None

        self.log(u"Calling TTS ... done")
    except Exception as exc:
        self.log_exc(u"An unexpected error occurred while calling TTS engine via subprocess", exc, False, None)
        # do not leave temporary files behind
        if tmp_text_file_path is not None:
            gf.delete_file(tmp_text_file_handler, tmp_text_file_path)
        if synt_tmp_file:
            gf.delete_file(output_file_handler, output_file_path)
        return (False, None)

    # check the file can be read
    if not gf.file_can_be_read(output_file_path):
        self.log_exc(u"Output file '%s' cannot be read" % (output_file_path), None, True, None)
        # do not leave the temporary output file behind
        if synt_tmp_file:
            gf.delete_file(output_file_handler, output_file_path)
        return (False, None)

    # read audio data
    ret = self._read_audio_data(output_file_path) if return_audio_data else (True, None)

    # if the output file was temporary, remove it
    if synt_tmp_file:
        self.log([u"Removing temporary output file path '%s'", output_file_path])
        gf.delete_file(output_file_handler, output_file_path)

    # return audio data or (True, None)
    return ret
def test_file_exists_true(self):
    """
    Check that ``gf.file_exists`` is true
    for a temporary file just created by ``gf.tmp_file``.
    """
    handler, path = gf.tmp_file()
    try:
        self.assertTrue(gf.file_exists(path))
    finally:
        # remove the temporary file even if the assertion fails
        gf.delete_file(handler, path)
def test_delete_file_existing(self):
    """
    Check that ``gf.delete_file`` removes an existing file:
    a temporary file must exist before the call,
    and it must not exist after the call.
    """
    tmp_handler, tmp_path = gf.tmp_file()

    # before: the file is there
    exists_before = gf.file_exists(tmp_path)
    self.assertTrue(exists_before)

    gf.delete_file(tmp_handler, tmp_path)

    # after: the file is gone
    exists_after = gf.file_exists(tmp_path)
    self.assertFalse(exists_after)
def read_samples_from_file(self):
    """
    Load the audio samples from file into memory.

    If ``self.file_format`` is ``None`` or, when safety checks are enabled,
    it is not ``("pcm_s16le", 1, self.rconf.sample_rate)``,
    the file will be first converted
    to a temporary PCM16 mono WAVE file.
    Audio data will be read from this temporary file,
    which will be then deleted from disk immediately,
    even if reading it fails.

    Otherwise,
    the audio data will be read directly
    from the given file,
    which will not be deleted from disk.

    :raises: :class:`~aeneas.audiofile.AudioFileConverterError`: if the path to the ``ffmpeg`` executable cannot be called
    :raises: :class:`~aeneas.audiofile.AudioFileUnsupportedFormatError`: if the audio file has a format not supported
    :raises: OSError: if the audio file cannot be read
    """
    self.log(u"Loading audio data...")

    # check the file can be read
    if not gf.file_can_be_read(self.file_path):
        self.log_exc(u"File '%s' cannot be read" % (self.file_path), None, True, OSError)

    # determine if we need to convert the audio file
    convert_audio_file = (
        (self.file_format is None) or
        (
            (self.rconf.safety_checks) and
            (self.file_format != ("pcm_s16le", 1, self.rconf.sample_rate))
        )
    )

    # convert the audio file if needed
    if convert_audio_file:
        # convert file to PCM16 mono WAVE with correct sample rate
        self.log(u"self.file_format is None or not good => converting self.file_path")
        tmp_handler, tmp_file_path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        self.log([u"Temporary PCM16 mono WAVE file: '%s'", tmp_file_path])
        try:
            self.log(u"Converting audio file to mono...")
            converter = FFMPEGWrapper(rconf=self.rconf, logger=self.logger)
            converter.convert(self.file_path, tmp_file_path)
            self.file_format = ("pcm_s16le", 1, self.rconf.sample_rate)
            self.log(u"Converting audio file to mono... done")
        except FFMPEGPathError:
            gf.delete_file(tmp_handler, tmp_file_path)
            self.log_exc(u"Unable to call ffmpeg executable", None, True, AudioFileConverterError)
        except OSError:
            gf.delete_file(tmp_handler, tmp_file_path)
            self.log_exc(u"Audio file format not supported by ffmpeg", None, True, AudioFileUnsupportedFormatError)
    else:
        # read the file directly
        if self.rconf.safety_checks:
            self.log(u"self.file_format is good => reading self.file_path directly")
        else:
            self.log_warn(u"Safety checks disabled => reading self.file_path directly")
        tmp_handler = None
        tmp_file_path = self.file_path

    # TODO allow calling C extension cwave to read samples faster
    try:
        self.audio_format = "pcm16"
        self.audio_channels = 1
        self.audio_sample_rate, self.__samples = scipywavread(tmp_file_path)
        # scipy reads a sample as an int16_t, that is, a number in [-32768, 32767]
        # so we convert it to a float64 in [-1, 1]
        self.__samples = self.__samples.astype("float64") / 32768
        self.__samples_capacity = len(self.__samples)
        self.__samples_length = self.__samples_capacity
        self._update_length()
    except ValueError:
        self.log_exc(u"Audio format not supported by scipywavread", None, True, AudioFileUnsupportedFormatError)
    finally:
        # if we converted the audio file, delete the temporary converted audio file,
        # no matter whether reading it succeeded or not
        if convert_audio_file:
            gf.delete_file(tmp_handler, tmp_file_path)
            self.log([u"Deleted temporary audio file: '%s'", tmp_file_path])

    # NOTE(review): _update_length() has already been called above
    # on success; this second call looks redundant
    self._update_length()
    self.log([u"Sample length:  %.3f", self.audio_length])
    self.log([u"Sample rate:    %d", self.audio_sample_rate])
    self.log([u"Audio format:   %s", self.audio_format])
    self.log([u"Audio channels: %d", self.audio_channels])
    self.log(u"Loading audio data... done")
def read_samples_from_file(self):
    """
    Load the audio samples from file into memory.

    If ``self.is_mono_wave`` is ``False``,
    the file will be first converted
    to a temporary PCM16 mono WAVE file.
    Audio data will be read from this temporary file,
    which will be then deleted from disk immediately,
    even if reading it fails.

    If ``self.is_mono_wave`` is ``True``,
    the audio data will be read directly
    from the given file,
    which will not be deleted from disk.

    :raises: :class:`~aeneas.audiofile.AudioFileConverterError`: if the path to the ``ffmpeg`` executable cannot be called
    :raises: :class:`~aeneas.audiofile.AudioFileUnsupportedFormatError`: if the audio file has a format not supported
    :raises: OSError: if the audio file cannot be read
    """
    self.log(u"Loading audio data...")

    # check the file can be read
    if not gf.file_can_be_read(self.file_path):
        self.log_exc(u"File '%s' cannot be read" % (self.file_path), None, True, OSError)

    # convert file to PCM16 mono WAVE
    if self.is_mono_wave:
        self.log(u"is_mono_wave=True => reading self.file_path directly")
        tmp_handler = None
        tmp_file_path = self.file_path
    else:
        self.log(u"is_mono_wave=False => converting self.file_path")
        tmp_handler, tmp_file_path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        self.log([u"Temporary PCM16 mono WAVE file: '%s'", tmp_file_path])
        try:
            self.log(u"Converting audio file to mono...")
            converter = FFMPEGWrapper(rconf=self.rconf, logger=self.logger)
            converter.convert(self.file_path, tmp_file_path)
            self.log(u"Converting audio file to mono... done")
        except FFMPEGPathError:
            gf.delete_file(tmp_handler, tmp_file_path)
            self.log_exc(u"Unable to call ffmpeg executable", None, True, AudioFileConverterError)
        except OSError:
            gf.delete_file(tmp_handler, tmp_file_path)
            self.log_exc(u"Audio file format not supported by ffmpeg", None, True, AudioFileUnsupportedFormatError)

    # TODO allow calling C extension cwave to read samples faster
    try:
        self.audio_format = "pcm16"
        self.audio_channels = 1
        self.audio_sample_rate, self.__samples = scipywavread(tmp_file_path)
        # scipy reads a sample as an int16_t, that is, a number in [-32768, 32767]
        # so we convert it to a float64 in [-1, 1]
        self.__samples = self.__samples.astype("float64") / 32768
        self.__samples_capacity = len(self.__samples)
        self.__samples_length = self.__samples_capacity
        self._update_length()
    except ValueError:
        self.log_exc(u"Audio format not supported by scipywavread", None, True, AudioFileUnsupportedFormatError)
    finally:
        # delete the temporary converted file,
        # no matter whether reading it succeeded or not
        if not self.is_mono_wave:
            gf.delete_file(tmp_handler, tmp_file_path)
            self.log([u"Deleted temporary PCM16 mono WAVE file: '%s'", tmp_file_path])

    # NOTE(review): _update_length() has already been called above
    # on success; this second call looks redundant
    self._update_length()
    self.log([u"Sample length:  %.3f", self.audio_length])
    self.log([u"Sample rate:    %d", self.audio_sample_rate])
    self.log([u"Audio format:   %s", self.audio_format])
    self.log([u"Audio channels: %d", self.audio_channels])
    self.log(u"Loading audio data... done")
def write(self, fmt, multiline=False, utf8=False, parameters=PARAMETERS):
    """
    Read a sync map from an XML file and write it
    in the given format to a temporary file,
    which is deleted afterwards.

    :param fmt: the output format, also used as the suffix of the temporary file
    :param bool multiline: passed to ``self.read``
    :param bool utf8: passed to ``self.read``
    :param parameters: the parameters passed to the sync map ``write`` call
    """
    suffix = "." + fmt
    syn = self.read(SyncMapFormat.XML, multiline, utf8, self.PARAMETERS)
    handler, output_file_path = gf.tmp_file(suffix=suffix)
    try:
        syn.write(fmt, output_file_path, parameters)
    finally:
        # remove the temporary file even if writing raises
        gf.delete_file(handler, output_file_path)
def _detect(self, min_length, max_length, tail=False):
    """
    Detect the head or tail within ``min_length`` and ``max_length`` duration.

    If detecting the tail, the real wave MFCC and the query are reversed
    so that the tail detection problem reduces to a head detection problem.

    Return the duration of the head or tail, in seconds.
    If no speech interval or no candidate is found, return zero.

    :param min_length: estimated minimum length;
                       if ``None``, ``self.MIN_LENGTH`` is used
    :type  min_length: :class:`~aeneas.timevalue.TimeValue`
    :param max_length: estimated maximum length;
                       if ``None``, ``self.MAX_LENGTH`` is used
    :type  max_length: :class:`~aeneas.timevalue.TimeValue`
    :param bool tail: if ``True``, detect the tail instead of the head
    :rtype: :class:`~aeneas.timevalue.TimeValue`
    :raises: TypeError: if one of the parameters is not ``None`` or a number
    :raises: ValueError: if one of the parameters is negative
    """
    def _sanitize(value, default, name):
        """ Replace ``None`` with the default, and check the value is a non-negative number. """
        if value is None:
            value = default
        try:
            value = TimeValue(value)
        except (TypeError, ValueError, InvalidOperation) as exc:
            self.log_exc(u"The value of %s is not a number" % (name), exc, True, TypeError)
        if value < 0:
            self.log_exc(u"The value of %s is negative" % (name), None, True, ValueError)
        return value

    min_length = _sanitize(min_length, self.MIN_LENGTH, "min_length")
    max_length = _sanitize(max_length, self.MAX_LENGTH, "max_length")
    mws = self.rconf.mws
    min_length_frames = int(min_length / mws)
    max_length_frames = int(max_length / mws)
    self.log([u"MFCC window shift s:     %.3f", mws])
    self.log([u"Min start length s:      %.3f", min_length])
    self.log([u"Min start length frames: %d", min_length_frames])
    self.log([u"Max start length s:      %.3f", max_length])
    self.log([u"Max start length frames: %d", max_length_frames])
    self.log([u"Tail?:                   %s", str(tail)])

    self.log(u"Synthesizing query...")
    synt_duration = max_length * self.QUERY_FACTOR
    self.log([u"Synthesizing at least %.3f seconds", synt_duration])
    tmp_handler, tmp_file_path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
    try:
        synt = Synthesizer(rconf=self.rconf, logger=self.logger)
        # the values returned by the synthesizer are not needed here
        synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=tail
        )
        self.log(u"Synthesizing query... done")

        self.log(u"Extracting MFCCs for query...")
        query_mfcc = AudioFileMFCC(tmp_file_path, rconf=self.rconf, logger=self.logger)
        self.log(u"Extracting MFCCs for query... done")
    finally:
        # delete the temporary file even if synthesizing
        # or extracting the MFCCs raises
        self.log(u"Cleaning up...")
        gf.delete_file(tmp_handler, tmp_file_path)
        self.log(u"Cleaning up... done")

    search_window = max_length * self.AUDIO_FACTOR
    search_window_end = min(int(search_window / mws), self.real_wave_mfcc.all_length)
    self.log([u"Query MFCC length (frames): %d", query_mfcc.all_length])
    self.log([u"Real MFCC length (frames):  %d", self.real_wave_mfcc.all_length])
    self.log([u"Search window end (s):      %.3f", search_window])
    self.log([u"Search window end (frames): %d", search_window_end])

    if tail:
        self.log(u"Tail => reversing real_wave_mfcc and query_mfcc")
        self.real_wave_mfcc.reverse()
        query_mfcc.reverse()

    # NOTE: VAD will be run here, if not done before
    speech_intervals = self.real_wave_mfcc.intervals(speech=True, time=False)
    if len(speech_intervals) < 1:
        self.log(u"No speech intervals, hence no start found")
        if tail:
            # restore the original orientation before returning
            self.real_wave_mfcc.reverse()
        return TimeValue("0.000")

    # generate a list of begin indices
    search_end = None
    candidates_begin = []
    for interval in speech_intervals:
        if (interval[0] >= min_length_frames) and (interval[0] <= max_length_frames):
            candidates_begin.append(interval[0])
        # the search ends with the first speech interval
        # reaching the end of the search window
        search_end = interval[1]
        if search_end >= search_window_end:
            break

    # for each begin index, compute the acm cost
    # to match the query
    # note that we take the min over the last column of the acm
    # meaning that we allow to match the entire query wave
    # against a portion of the real wave
    candidates = []
    for candidate_begin in candidates_begin:
        self.log([u"Candidate interval starting at %d == %.3f", candidate_begin, candidate_begin * mws])
        try:
            rwm = AudioFileMFCC(
                mfcc_matrix=self.real_wave_mfcc.all_mfcc[:, candidate_begin:search_end],
                rconf=self.rconf,
                logger=self.logger
            )
            dtw = DTWAligner(
                real_wave_mfcc=rwm,
                synt_wave_mfcc=query_mfcc,
                rconf=self.rconf,
                logger=self.logger
            )
            acm = dtw.compute_accumulated_cost_matrix()
            last_column = acm[:, -1]
            min_value = numpy.min(last_column)
            min_index = numpy.argmin(last_column)
            self.log([u"Candidate interval: %d %d == %.3f %.3f", candidate_begin, search_end, candidate_begin * mws, search_end * mws])
            self.log([u"  Min value: %.6f", min_value])
            self.log([u"  Min index: %d == %.3f", min_index, min_index * mws])
            candidates.append((min_value, candidate_begin, min_index))
        except Exception as exc:
            # a failing candidate is logged and skipped
            self.log_exc(u"An unexpected error occurred while running _detect", exc, False, None)

    # reverse again the real wave
    if tail:
        self.log(u"Tail => reversing real_wave_mfcc again")
        self.real_wave_mfcc.reverse()

    # return
    if len(candidates) < 1:
        self.log(u"No candidates found")
        return TimeValue("0.000")
    self.log(u"Candidates:")
    for candidate in candidates:
        self.log([u"  Value: %.6f Begin Time: %.3f Min Index: %d", candidate[0], candidate[1] * mws, candidate[2]])
    # the best candidate is the one with the smallest cost
    best = sorted(candidates)[0][1]
    self.log([u"Best candidate: %d == %.3f", best, best * mws])
    return best * mws
def synthesize_multiple(self, audio_file_path, c_quit_after, c_backwards, u_text):
    """
    Synthesize the text contained in the given fragment list
    into a ``wav`` file.

    The synthesis is performed by a subprocess running
    the ``aeneas.cewsubprocess`` module, which receives the text
    through a temporary text file and writes its results
    into a temporary data file.
    Both temporary files are deleted before returning,
    even if an error occurs.

    :param string audio_file_path: the path to the output audio file
    :param float c_quit_after: stop synthesizing as soon as
                               reaching this many seconds
    :param bool c_backwards: synthesizing from the end of the text file
    :param object u_text: a list of ``(voice_code, text)`` tuples
    :rtype: tuple ``(sample_rate, synthesized, intervals)``
    """
    self.log([u"Audio file path: '%s'", audio_file_path])
    self.log([u"c_quit_after: '%.3f'", c_quit_after])
    self.log([u"c_backwards: '%d'", c_backwards])

    text_file_handler, text_file_path = gf.tmp_file()
    data_file_handler, data_file_path = gf.tmp_file()
    try:
        self.log([u"Temporary text file path: '%s'", text_file_path])
        self.log([u"Temporary data file path: '%s'", data_file_path])

        self.log(u"Populating the text file...")
        with io.open(text_file_path, "w", encoding="utf-8") as tmp_text_file:
            for f_voice_code, f_text in u_text:
                tmp_text_file.write(u"%s %s\n" % (f_voice_code, f_text))
        self.log(u"Populating the text file... done")

        arguments = [
            self.rconf[RuntimeConfiguration.CEW_SUBPROCESS_PATH],
            "-m",
            "aeneas.cewsubprocess",
            "%.3f" % c_quit_after,
            "%d" % c_backwards,
            text_file_path,
            audio_file_path,
            data_file_path
        ]
        self.log([u"Calling with arguments '%s'", u" ".join(arguments)])
        proc = subprocess.Popen(
            arguments,
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True)
        proc.communicate()

        self.log(u"Reading output data...")
        with io.open(data_file_path, "r", encoding="utf-8") as data_file:
            lines = data_file.read().splitlines()
            # the first two lines are integers, returned as the first two
            # items of the result; the remaining lines are "begin end" pairs
            sr = int(lines[0])
            sf = int(lines[1])
            intervals = []
            for line in lines[2:]:
                values = line.split(u" ")
                if len(values) == 2:
                    intervals.append((TimeValue(values[0]), TimeValue(values[1])))
        self.log(u"Reading output data... done")
    finally:
        # delete the temporary files even if the subprocess
        # or the parsing of its output fails
        self.log(u"Deleting text and data files...")
        gf.delete_file(text_file_handler, text_file_path)
        gf.delete_file(data_file_handler, data_file_path)
        self.log(u"Deleting text and data files... done")

    return (sr, sf, intervals)
def _synthesize_single_subprocess(self, text, voice_code, output_file_path):
    """
    Synthesize a single text fragment via ``subprocess``.

    The text is passed to the TTS engine either through
    a temporary text file or through stdin,
    and the audio data is obtained either from the file
    written by the TTS engine or from its stdout,
    depending on the placeholders in ``self.subprocess_arguments``.

    In case of failure while calling the TTS engine,
    ``(False, None)`` is returned
    and the temporary text file, if any, is deleted.

    :param string text: the text to be synthesized
    :param voice_code: the voice code passed to the TTS engine
    :param string output_file_path: the path of the output audio file
    :rtype: tuple (result, (duration, sample_rate, encoding, samples))
    """
    self.log(u"Synthesizing using pure Python...")
    # set here, so that the except branch below can always inspect them
    tmp_text_file_handler = None
    tmp_text_file_path = None
    try:
        # if the TTS engine reads text from file,
        # write the text into a temporary file
        if self.CLI_PARAMETER_TEXT_PATH in self.subprocess_arguments:
            self.log(u"TTS engine reads text from file")
            tmp_text_file_handler, tmp_text_file_path = gf.tmp_file(suffix=u".txt", root=self.rconf[RuntimeConfiguration.TMP_PATH])
            self.log([u"Creating temporary text file '%s'...", tmp_text_file_path])
            with io.open(tmp_text_file_path, "w", encoding="utf-8") as tmp_text_file:
                tmp_text_file.write(text)
            self.log([u"Creating temporary text file '%s'... done", tmp_text_file_path])
        else:
            self.log(u"TTS engine reads text from stdin")

        # copy all relevant arguments
        self.log(u"Creating arguments list...")
        arguments = []
        for arg in self.subprocess_arguments:
            if arg == self.CLI_PARAMETER_VOICE_CODE_FUNCTION:
                arguments.extend(self._voice_code_to_subprocess(voice_code))
            elif arg == self.CLI_PARAMETER_VOICE_CODE_STRING:
                arguments.append(voice_code)
            elif arg == self.CLI_PARAMETER_TEXT_PATH:
                arguments.append(tmp_text_file_path)
            elif arg == self.CLI_PARAMETER_WAVE_PATH:
                arguments.append(output_file_path)
            elif arg == self.CLI_PARAMETER_TEXT_STDIN:
                # placeholder, do not append
                pass
            elif arg == self.CLI_PARAMETER_WAVE_STDOUT:
                # placeholder, do not append
                pass
            else:
                arguments.append(arg)
        self.log(u"Creating arguments list... done")

        # actual call via subprocess
        self.log(u"Calling TTS engine...")
        self.log([u"Calling with arguments '%s'", arguments])
        self.log([u"Calling with text '%s'", text])
        proc = subprocess.Popen(
            arguments,
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True
        )
        if self.CLI_PARAMETER_TEXT_STDIN in self.subprocess_arguments:
            self.log(u"Passing text via stdin...")
            if gf.PY2:
                (stdoutdata, stderrdata) = proc.communicate(input=gf.safe_bytes(text))
            else:
                (stdoutdata, stderrdata) = proc.communicate(input=text)
            self.log(u"Passing text via stdin... done")
        else:
            self.log(u"Passing text via file...")
            (stdoutdata, stderrdata) = proc.communicate()
            self.log(u"Passing text via file... done")
        proc.stdout.close()
        proc.stdin.close()
        proc.stderr.close()

        if self.CLI_PARAMETER_WAVE_STDOUT in self.subprocess_arguments:
            self.log(u"TTS engine wrote audio data to stdout")
            self.log([u"Writing audio data to file '%s'...", output_file_path])
            # NOTE(review): with universal_newlines=True, on Python 3
            # stdoutdata is a text string, which cannot be written
            # to a file opened in binary mode -- confirm this path works
            with io.open(output_file_path, "wb") as output_file:
                output_file.write(stdoutdata)
            self.log([u"Writing audio data to file '%s'... done", output_file_path])
        else:
            self.log(u"TTS engine wrote audio data to file")

        if tmp_text_file_path is not None:
            self.log([u"Delete temporary text file '%s'", tmp_text_file_path])
            gf.delete_file(tmp_text_file_handler, tmp_text_file_path)
            # mark it as deleted, so that it is not deleted twice
            tmp_text_file_handler = None
            tmp_text_file_path = None

        self.log(u"Calling TTS ... done")
    except Exception as exc:
        self.log_exc(u"An unexpected error occurred while calling TTS engine via subprocess", exc, False, None)
        # do not leave the temporary text file behind
        if tmp_text_file_path is not None:
            gf.delete_file(tmp_text_file_handler, tmp_text_file_path)
        return (False, None)

    # check the file can be read
    if not gf.file_can_be_read(output_file_path):
        self.log_exc(u"Output file '%s' cannot be read" % (output_file_path), None, True, None)
        return (False, None)

    # return the duration of the output file
    try:
        # if we know the TTS outputs to PCM16 mono WAVE,
        # we can read samples directly from it,
        # without an intermediate conversion through ffmpeg
        audio_file = AudioFile(
            file_path=output_file_path,
            is_mono_wave=self.OUTPUT_MONO_WAVE,
            rconf=self.rconf,
            logger=self.logger
        )
        audio_file.read_samples_from_file()
        self.log([u"Duration of '%s': %f", output_file_path, audio_file.audio_length])
        self.log(u"Synthesizing using pure Python... done")
        return (True, (
            audio_file.audio_length,
            audio_file.audio_sample_rate,
            audio_file.audio_format,
            audio_file.audio_samples
        ))
    except (AudioFileUnsupportedFormatError, OSError) as exc:
        self.log_exc(u"An unexpected error occurred while trying to read the sythesized audio file", exc, True, None)
        return (False, None)