def inner(c_ext, cew_subprocess, cache):
    """
    Run one ``synthesize_multiple`` call with the given runtime switches
    and check its outcome.

    ``self``, ``text_file``, ``ofp``, ``quit_after``, ``backwards``,
    ``zero_length`` and ``expected_exc`` are not parameters: they are
    read from the enclosing scope.

    :param c_ext: value stored under ``RuntimeConfiguration.C_EXTENSIONS``
    :param cew_subprocess: value stored under
                           ``RuntimeConfiguration.CEW_SUBPROCESS_ENABLED``
    :param cache: value stored under ``RuntimeConfiguration.TTS_CACHE``;
                  when truthy, the engine cache is cleared afterwards
    """
    if ofp is None:
        handler, output_file_path = gf.tmp_file(suffix=".wav")
    else:
        handler = None
        output_file_path = ofp
    # Bind the name before entering the try block: the except handler
    # tests it, and it would otherwise be unbound whenever building the
    # configuration or the engine itself raises.
    tts_engine = None
    try:
        rconf = RuntimeConfiguration()
        rconf[RuntimeConfiguration.TTS] = self.TTS
        rconf[RuntimeConfiguration.TTS_PATH] = self.TTS_PATH
        rconf[RuntimeConfiguration.C_EXTENSIONS] = c_ext
        rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = cew_subprocess
        rconf[RuntimeConfiguration.TTS_CACHE] = cache
        tts_engine = self.TTS_CLASS(rconf=rconf)
        # Only the total time is checked; the unpacking still requires
        # a three-item result.
        _anchors, total_time, _num_chars = tts_engine.synthesize_multiple(
            text_file, output_file_path, quit_after, backwards)
        gf.delete_file(handler, output_file_path)
        if cache:
            tts_engine.clear_cache()
        if zero_length:
            self.assertEqual(total_time, 0.0)
        else:
            self.assertGreater(total_time, 0.0)
    except (OSError, TypeError, UnicodeDecodeError, ValueError) as exc:
        gf.delete_file(handler, output_file_path)
        if (cache) and (tts_engine is not None):
            tts_engine.clear_cache()
        # Re-raise inside assertRaises so that an expected failure passes
        # and an unexpected one is reported by the test framework.
        with self.assertRaises(expected_exc):
            raise exc
def synthesize_multiple(self, text_file, ofp=None, quit_after=None, backwards=False, zero_length=False):
    """
    Synthesize ``text_file`` with the Festival wrapper and check the
    total time that it reports.

    :param text_file: the text file object passed to the engine
    :param ofp: output file path; when ``None`` a temporary ``.wav``
                file is obtained from ``gf.tmp_file``
    :param quit_after: forwarded unchanged to the engine
    :param backwards: forwarded unchanged to the engine
    :param zero_length: if ``True`` the total time must equal ``0.0``,
                        otherwise it must be greater than ``0.0``
    """
    if ofp is not None:
        handler, output_file_path = None, ofp
    else:
        handler, output_file_path = gf.tmp_file(suffix=".wav")
    try:
        settings = RuntimeConfiguration()
        settings[RuntimeConfiguration.TTS] = u"festival"
        settings[RuntimeConfiguration.TTS_PATH] = u"text2wave"
        engine = FESTIVALWrapper(rconf=settings)
        _anchors, total_time, _num_chars = engine.synthesize_multiple(
            text_file, output_file_path, quit_after, backwards)
        gf.delete_file(handler, output_file_path)
        # pick the assertion that matches the expected outcome
        check = self.assertEqual if zero_length else self.assertGreater
        check(total_time, 0.0)
    except (OSError, TypeError, UnicodeDecodeError, ValueError) as exc:
        # clean up, then let the caller see the original error
        gf.delete_file(handler, output_file_path)
        raise exc
def align(text_path, audio_path, align_out_path, word_align=True, language=u"hi"):
    """
    Align a text file with an audio file and write the sync map as JSON.

    :param text_path: path assigned to the task as its text file
    :param audio_path: path assigned to the task as its audio file
    :param align_out_path: path where the JSON sync map is written
    :param word_align: if true, configure the task with
                       ``os_task_file_levels=3`` and ``is_text_type=mplain``
                       and enable the nonspeech masking options;
                       otherwise use ``is_text_type=plain`` and no
                       runtime configuration
    :param language: value of the ``task_language`` option; defaults to
                     ``hi``, the value that used to be hard-coded
    """
    # create Task object
    options = [
        u"task_language=" + language,
        "os_task_file_format=json",
    ]
    rconf = None
    if word_align:
        options.append("os_task_file_levels=3")
        options.append("is_text_type=mplain")
        rconf = RuntimeConfiguration()
        rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH] = True
        rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH_L3] = True
    else:
        options.append("is_text_type=plain")
    task = Task(config_string="|".join(options))
    task.text_file_path_absolute = text_path
    task.audio_file_path_absolute = audio_path
    task.sync_map_file_path_absolute = align_out_path

    # process Task
    ExecuteTask(task, rconf=rconf).execute()

    # output sync map to file
    task.output_sync_map_file()

    # Rewrite the JSON with ensure_ascii=False so that non-ASCII
    # characters are stored literally instead of as \uXXXX escapes.
    with open(align_out_path, 'r', encoding='utf8') as f:
        alignment = json.load(f)
    with open(align_out_path, 'w', encoding='utf8') as f:
        json.dump(alignment, f, ensure_ascii=False, indent=2)
def test_use_cache(self):
    """
    With ``TTS_CACHE`` enabled the engine must report ``use_cache``
    and expose a cache object.

    Nothing is checked when ``self.TTS`` is the empty string.
    """
    if self.TTS != u"":
        settings = RuntimeConfiguration()
        settings[RuntimeConfiguration.TTS_CACHE] = True
        engine = self.TTS_CLASS(rconf=settings)
        self.assertTrue(engine.use_cache)
        self.assertIsNotNone(engine.cache)
def test_set_tts(self):
    """
    After ``set_tts`` for levels 1, 2 and 3 (applied in that order to
    the same configuration) ``tts`` is ``espeak`` and ``tts_path`` is
    ``None``.
    """
    rconf = RuntimeConfiguration()
    for level in (1, 2, 3):
        rconf.set_tts(level=level)
        self.assertEqual(rconf.tts, "espeak")
        self.assertEqual(rconf.tts_path, None)
def test_set_granularity(self):
    """
    After ``set_granularity`` for levels 1, 2 and 3 (applied in that
    order to the same configuration) ``mmn``, ``mwl`` and ``mws`` hold
    the expected values.
    """
    # (level, expected mwl, expected mws)
    expectations = (
        (1, "0.100", "0.040"),
        (2, "0.050", "0.020"),
        (3, "0.020", "0.005"),
    )
    rconf = RuntimeConfiguration()
    for level, mwl_text, mws_text in expectations:
        rconf.set_granularity(level=level)
        self.assertEqual(rconf.mmn, False)
        self.assertEqual(rconf.mwl, TimeValue(mwl_text))
        self.assertEqual(rconf.mws, TimeValue(mws_text))
def synthesize_single(self, text, language, ofp=None, zero_length=False):
    """
    Synthesize a single text with the Festival wrapper and check the
    value that it returns.

    :param text: the text passed to the engine
    :param language: the language passed to the engine
    :param ofp: output file path; when ``None`` a temporary ``.wav``
                file is obtained from ``gf.tmp_file``
    :param zero_length: if ``True`` the result must equal ``0``,
                        otherwise it must be greater than ``0``
    """
    if ofp is not None:
        handler, output_file_path = None, ofp
    else:
        handler, output_file_path = gf.tmp_file(suffix=".wav")
    try:
        settings = RuntimeConfiguration()
        settings[RuntimeConfiguration.TTS] = u"festival"
        settings[RuntimeConfiguration.TTS_PATH] = u"text2wave"
        engine = FESTIVALWrapper(rconf=settings)
        outcome = engine.synthesize_single(text, language, output_file_path)
        gf.delete_file(handler, output_file_path)
        # pick the assertion that matches the expected outcome
        check = self.assertEqual if zero_length else self.assertGreater
        check(outcome, 0)
    except (OSError, TypeError, UnicodeDecodeError, ValueError) as exc:
        # clean up, then let the caller see the original error
        gf.delete_file(handler, output_file_path)
        raise exc
def __adjust_durations(self, subs: List[SubRipItem], audio_file_path: str, stretch_in_lang: str) -> List[SubRipItem]:
    """Re-time subtitles by running an aeneas alignment task on their audio span.

    The audio between the start of the first cue and the end of the last
    cue is extracted, the cue texts are written to a text file, an aeneas
    task produces an SRT sync map from the two, and the loaded result is
    re-indexed from ``subs`` and shifted by the start time of the first cue.

    Arguments:
        subs {list} -- The subtitle cues to adjust.
        audio_file_path {string} -- The path of the audio file the segment is extracted from.
        stretch_in_lang {string} -- The value used for the ``task_language`` option.

    Returns:
        list -- The subtitle cues loaded from the generated sync map.
    """
    from aeneas.executetask import ExecuteTask
    from aeneas.task import Task
    from aeneas.runtimeconfiguration import RuntimeConfiguration
    from aeneas.logger import Logger as AeneasLogger

    # Initialise a DTW alignment task
    runtime_config_string = "dtw_algorithm=stripe"  # stripe or exact
    task = Task(
        config_string="task_language={}|os_task_file_format=srt|is_text_type=subtitles".format(stretch_in_lang)
    )

    try:
        segment_path, _ = MediaHelper.extract_audio_from_start_to_end(
            audio_file_path,
            str(subs[0].start),
            str(subs[-1].end),
        )

        # Create a text file for DTW alignments
        root = os.path.splitext(segment_path)[0]
        text_file_path = "{}.txt".format(root)
        cue_separator = os.linesep * 2
        with open(text_file_path, "w", encoding="utf8") as text_file:
            text_file.writelines(cue.text + cue_separator for cue in subs)

        task.audio_file_path_absolute = segment_path
        task.text_file_path_absolute = text_file_path
        task.sync_map_file_path_absolute = "{}.srt".format(root)

        # Tee the aeneas log only in verbose mode; quiet mode wins over verbose
        tee = bool(Logger.VERBOSE and not Logger.QUIET)

        with self.__lock:
            # Execute the task
            ExecuteTask(
                task=task,
                rconf=RuntimeConfiguration(config_string=runtime_config_string),
                logger=AeneasLogger(tee=tee),
            ).execute()

            # Output new subtitle segment to a file
            task.output_sync_map_file()

        # Load the above subtitle segment
        adjusted_subs = Subtitle.load(task.sync_map_file_path_absolute).subs
        for position, adjusted in enumerate(adjusted_subs):
            adjusted.index = subs[position].index

        offset = MediaHelper.get_duration_in_seconds(start=None, end=str(subs[0].start))
        adjusted_subs.shift(seconds=offset)
        return adjusted_subs
    finally:
        # Housekeep intermediate files
        intermediate_paths = (
            task.audio_file_path_absolute,
            task.text_file_path_absolute,
            task.sync_map_file_path_absolute,
        )
        for intermediate_path in intermediate_paths:
            if intermediate_path is not None and os.path.exists(intermediate_path):
                os.remove(intermediate_path)
def test_config_string(self):
    """Smoke test: reading ``config_string`` on a default configuration must not raise."""
    _ = RuntimeConfiguration().config_string
def test_tts(self):
    """A default configuration reports ``espeak`` as ``tts``."""
    default_tts = RuntimeConfiguration().tts
    self.assertEqual(default_tts, "espeak")
def test_tts_path(self):
    """A default configuration has ``tts_path`` equal to ``None``."""
    default_path = RuntimeConfiguration().tts_path
    self.assertEqual(default_path, None)
def test_mmn(self):
    """A default configuration has ``mmn`` equal to ``False``."""
    default_mmn = RuntimeConfiguration().mmn
    self.assertEqual(default_mmn, False)
def test_mwl(self):
    """A default configuration has ``mwl`` equal to ``TimeValue("0.100")``."""
    default_mwl = RuntimeConfiguration().mwl
    self.assertEqual(default_mwl, TimeValue("0.100"))
def run(self, arguments, show_help=True):
    """
    Program entry point.

    Please note that the first item in ``arguments`` is discarded,
    as it is assumed to be the script/invocation name;
    pass a "dumb" placeholder if you call this method with
    an argument different than ``sys.argv``.

    The "special" switches (verbosity, runtime configuration, log)
    are consumed here and removed from the list stored in
    ``self.actual_arguments`` before ``self.perform_command()`` is called.

    The returned value is whatever ``self.print_help()``,
    ``self.print_name_version()`` or ``self.exit()`` return.

    :param arguments: the list of arguments
    :type  arguments: list
    :param show_help: if ``False``, do not show help on ``-h`` and ``--help``
    :type  show_help: bool
    :rtype: int
    """
    # convert arguments into Unicode strings
    if self.use_sys:
        # check that sys.stdin.encoding and sys.stdout.encoding are set to utf-8
        if not gf.FROZEN:
            if sys.stdin.encoding not in ["UTF-8", "UTF8"]:
                self.print_warning(
                    u"The default input encoding is not UTF-8.")
                self.print_warning(
                    u"You might want to set 'PYTHONIOENCODING=UTF-8' in your shell."
                )
            if sys.stdout.encoding not in ["UTF-8", "UTF8"]:
                self.print_warning(
                    u"The default output encoding is not UTF-8.")
                self.print_warning(
                    u"You might want to set 'PYTHONIOENCODING=UTF-8' in your shell."
                )
        # decode using sys.stdin.encoding
        args = [gf.safe_unicode_stdin(arg) for arg in arguments]
    else:
        # decode using utf-8 (but you should pass Unicode strings as parameters anyway)
        args = [gf.safe_unicode(arg) for arg in arguments]

    # help/version switches short-circuit everything else,
    # and are looked up in the full list (invocation name included)
    if show_help:
        if u"-h" in args:
            return self.print_help(short=True)
        if u"--help" in args:
            return self.print_help(short=False)
        if u"--version" in args:
            return self.print_name_version()

    # store formal arguments
    self.formal_arguments_raw = arguments
    self.formal_arguments = args

    # to obtain the actual arguments,
    # remove the first one and "special" switches
    args = args[1:]
    # snapshot taken before any removal: the membership tests below
    # refer to the arguments as originally given
    set_args = set(args)

    # set verbosity, if requested
    for flag in set([u"-v", u"--verbose"]) & set_args:
        self.verbose = True
        args.remove(flag)
    for flag in set([u"-vv", u"--very-verbose"]) & set_args:
        # very verbose implies verbose
        self.verbose = True
        self.very_verbose = True
        args.remove(flag)

    # set RuntimeConfiguration string, if specified
    for flag in [u"-r", u"--runtime-configuration"]:
        rconf_string = self.has_option_with_value(flag, actual_arguments=False)
        if rconf_string is not None:
            self.rconf = RuntimeConfiguration(rconf_string)
            # the switch was given in the "flag=value" form: remove that exact token
            args.remove("%s=%s" % (flag, rconf_string))

    # set log file path, if requested
    log_path = None
    for flag in [u"-l", u"--log"]:
        log_path = self.has_option_with_value(flag, actual_arguments=False)
        if log_path is not None:
            # explicit path given as "flag=value"
            args.remove("%s=%s" % (flag, log_path))
        elif flag in set_args:
            # bare switch: log to a temporary file created under TMP_PATH
            # NOTE(review): ``handler`` is not used afterwards in this method
            handler, log_path = gf.tmp_file(
                suffix=u".log",
                root=self.rconf[RuntimeConfiguration.TMP_PATH])
            args.remove(flag)
        if log_path is not None:
            self.log_file_path = log_path

    # if no actual arguments left, print help
    if (len(args) < 1) and (show_help):
        return self.print_help(short=True)

    # store actual arguments
    self.actual_arguments = args

    # create logger
    self.logger = Logger(tee=self.verbose,
                         tee_show_datetime=self.very_verbose)
    self.log([u"Formal arguments: %s", self.formal_arguments])
    self.log([u"Actual arguments: %s", self.actual_arguments])
    self.log([u"Runtime configuration: '%s'", self.rconf.config_string()])

    # perform command
    exit_code = self.perform_command()
    self.log([u"Execution completed with code %d", exit_code])

    # output log if requested
    if self.log_file_path is not None:
        self.log([
            u"User requested saving log to file '%s'", self.log_file_path
        ])
        self.logger.write(self.log_file_path)
        if self.use_sys:
            self.print_info(u"Log written to file '%s'" %
                            self.log_file_path)

    return self.exit(exit_code)
def test_dtw_margin(self):
    """A default configuration has ``dtw_margin`` equal to ``TimeValue("60.000")``."""
    default_margin = RuntimeConfiguration().dtw_margin
    self.assertEqual(default_margin, TimeValue("60.000"))
def __init__(self, logger=None, rconf=None):
    """
    Store the given logger and runtime configuration,
    creating a default ``Logger`` and/or ``RuntimeConfiguration``
    for whichever of the two is ``None``.

    :param logger: the logger object to use, or ``None``
    :param rconf: the runtime configuration to use, or ``None``
    """
    if logger is None:
        logger = Logger()
    self.logger = logger
    if rconf is None:
        rconf = RuntimeConfiguration()
    self.rconf = rconf
def test_clone(self):
    """``clone()`` returns a distinct object whose ``config_string`` compares equal."""
    original = RuntimeConfiguration()
    duplicate = original.clone()
    # different objects ...
    self.assertNotEqual(id(original), id(duplicate))
    # ... with the same configuration string
    self.assertEqual(original.config_string, duplicate.config_string)
# Only these language codes are accepted.
# NOTE(review): the message mentions "hi" and "eng" only, but "hin" is accepted too.
if lang not in ["eng", "hi", "hin"]:
    print("only hi and eng allowed for language")
    exit(1)

# aeneas is imported only after the language check has passed
from aeneas.executetask import ExecuteTask
from aeneas.task import Task
from aeneas.runtimeconfiguration import RuntimeConfiguration

# Align subtitle-style text with the audio and produce an SRT sync map
config_string = u"task_language=" + lang + u"|is_text_type=subtitles|os_task_file_format=srt"
# The sync map is written to a temporary file.
# NOTE(review): ``tempout`` (the descriptor returned by mkstemp) is not
# closed in the visible part of the script - confirm it is closed later.
tempout, tempfilename = tempfile.mkstemp()
task = Task(config_string=config_string)
task.audio_file_path_absolute = args.audio
task.text_file_path_absolute = args.txt
task.sync_map_file_path_absolute = tempfilename

rconf = RuntimeConfiguration()
# This option ignores the non-word sounds in the audio
rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH] = True
rconf[RuntimeConfiguration.MFCC_MASK_LOG_ENERGY_THRESHOLD] = 2.5
# To use a different Text-to-Speech engine
#rconf[RuntimeConfiguration.TTS] = "festival"

# process Task
ExecuteTask(task, rconf=rconf).execute()
# output sync map to file
task.output_sync_map_file()

# Start the output file with the WebVTT header line.
# NOTE(review): ``f`` is not closed in the visible part of the script.
f = open(args.out, "w")
f.writelines("WEBVTT\n")
def test_loggable_rconf(self):
    """A ``Loggable`` built with only ``rconf`` keeps it and still has a logger."""
    config = RuntimeConfiguration()
    subject = Loggable(rconf=config)
    self.assertEqual(config, subject.rconf)
    self.assertIsNotNone(subject.logger)
def test_loggable_rconf_logger(self):
    """A ``Loggable`` built with both ``rconf`` and ``logger`` keeps both."""
    given_logger = Logger()
    given_rconf = RuntimeConfiguration()
    subject = Loggable(rconf=given_rconf, logger=given_logger)
    self.assertEqual(given_rconf, subject.rconf)
    self.assertEqual(given_logger, subject.logger)
def test_set_rconf_string(self):
    """
    For each ``key=value`` configuration string, a configuration built
    from it must return the expected typed value when indexed by key.
    """
    # (configuration string, key to look up, expected value)
    cases = (
        (u"aba_nonspeech_tolerance=0.040", "aba_nonspeech_tolerance", TimeValue("0.040")),
        (u"aba_no_zero_duration=0.040", "aba_no_zero_duration", TimeValue("0.040")),
        (u"allow_unlisted_languages=True", "allow_unlisted_languages", True),
        (u"c_extensions=False", "c_extensions", False),
        (u"cdtw=False", "cdtw", False),
        (u"cew=False", "cew", False),
        (u"cmfcc=False", "cmfcc", False),
        (u"cew_subprocess_enabled=True", "cew_subprocess_enabled", True),
        (u"cew_subprocess_path=/foo/bar/python", "cew_subprocess_path", "/foo/bar/python"),
        (u"downloader_sleep=5.000", "downloader_sleep", TimeValue("5.000")),
        (u"downloader_retry_attempts=5", "downloader_retry_attempts", 5),
        (u"dtw_algorithm=exact", "dtw_algorithm", "exact"),
        (u"dtw_margin=100", "dtw_margin", TimeValue("100")),
        (u"ffmpeg_path=/foo/bar/ffmpeg", "ffmpeg_path", "/foo/bar/ffmpeg"),
        (u"ffmpeg_sample_rate=8000", "ffmpeg_sample_rate", 8000),
        (u"ffprobe_path=/foo/bar/ffprobe", "ffprobe_path", "/foo/bar/ffprobe"),
        (u"job_max_tasks=10", "job_max_tasks", 10),
        (u"mfcc_filters=100", "mfcc_filters", 100),
        (u"mfcc_size=20", "mfcc_size", 20),
        (u"mfcc_fft_order=256", "mfcc_fft_order", 256),
        (u"mfcc_lower_frequency=120.0", "mfcc_lower_frequency", 120.0),
        (u"mfcc_upper_frequency=5000.0", "mfcc_upper_frequency", 5000.0),
        (u"mfcc_emphasis_factor=1.0", "mfcc_emphasis_factor", 1.0),
        (u"mfcc_mask_nonspeech=True", "mfcc_mask_nonspeech", True),
        (u"mfcc_window_length=0.360", "mfcc_window_length", TimeValue("0.360")),
        (u"mfcc_window_shift=0.160", "mfcc_window_shift", TimeValue("0.160")),
        (u"dtw_margin_l1=100", "dtw_margin_l1", TimeValue("100")),
        (u"mfcc_mask_nonspeech_l1=True", "mfcc_mask_nonspeech_l1", True),
        (u"mfcc_window_length_l1=0.360", "mfcc_window_length_l1", TimeValue("0.360")),
        (u"mfcc_window_shift_l1=0.160", "mfcc_window_shift_l1", TimeValue("0.160")),
        (u"dtw_margin_l2=30", "dtw_margin_l2", TimeValue("30")),
        (u"mfcc_mask_nonspeech_l2=True", "mfcc_mask_nonspeech_l2", True),
        (u"mfcc_window_length_l2=0.360", "mfcc_window_length_l2", TimeValue("0.360")),
        (u"mfcc_window_shift_l2=0.160", "mfcc_window_shift_l2", TimeValue("0.160")),
        (u"dtw_margin_l3=10", "dtw_margin_l3", TimeValue("10")),
        (u"mfcc_mask_nonspeech_l3=True", "mfcc_mask_nonspeech_l3", True),
        (u"mfcc_window_length_l3=0.360", "mfcc_window_length_l3", TimeValue("0.360")),
        (u"mfcc_window_shift_l3=0.160", "mfcc_window_shift_l3", TimeValue("0.160")),
        (u"mfcc_mask_extend_speech_after=1", "mfcc_mask_extend_speech_after", 1),
        (u"mfcc_mask_extend_speech_before=1", "mfcc_mask_extend_speech_before", 1),
        (u"mfcc_mask_log_energy_threshold=0.750", "mfcc_mask_log_energy_threshold", 0.750),
        (u"mfcc_mask_min_nonspeech_length=5", "mfcc_mask_min_nonspeech_length", 5),
        (u"nuance_tts_api_id=foo", "nuance_tts_api_id", "foo"),
        (u"nuance_tts_api_key=bar", "nuance_tts_api_key", "bar"),
        (u"safety_checks=False", "safety_checks", False),
        (u"task_max_audio_length=1000", "task_max_audio_length", TimeValue("1000")),
        (u"task_max_text_length=1000", "task_max_text_length", 1000),
        (u"tmp_path=/foo/bar", "tmp_path", "/foo/bar"),
        (u"tts=festival", "tts", "festival"),
        (u"tts_path=/foo/bar/festival", "tts_path", "/foo/bar/festival"),
        (u"tts_api_sleep=5.000", "tts_api_sleep", TimeValue("5.000")),
        (u"tts_api_retry_attempts=3", "tts_api_retry_attempts", 3),
        (u"tts_voice_code=ru", "tts_voice_code", "ru"),
        (u"tts_cache=True", "tts_cache", True),
        (u"tts_l1=festival", "tts_l1", "festival"),
        (u"tts_path_l1=/foo/bar/festival", "tts_path_l1", "/foo/bar/festival"),
        (u"tts_l2=festival", "tts_l2", "festival"),
        (u"tts_path_l2=/foo/bar/festival", "tts_path_l2", "/foo/bar/festival"),
        (u"tts_l3=festival", "tts_l3", "festival"),
        (u"tts_path_l3=/foo/bar/festival", "tts_path_l3", "/foo/bar/festival"),
        (u"vad_extend_speech_after=1.000", "vad_extend_speech_after", TimeValue("1.000")),
        (u"vad_extend_speech_before=1.000", "vad_extend_speech_before", TimeValue("1.000")),
        (u"vad_log_energy_threshold=0.750", "vad_log_energy_threshold", 0.750),
        (u"vad_min_nonspeech_length=0.500", "vad_min_nonspeech_length", TimeValue("0.500")),
    )
    for config_text, key, expected in cases:
        parsed = RuntimeConfiguration(config_text)
        self.assertEqual(parsed[key], expected)
def test_safety_checks(self):
    """A default configuration has ``safety_checks`` equal to ``True``."""
    default_flag = RuntimeConfiguration().safety_checks
    self.assertEqual(default_flag, True)
def test_convert_rc(self):
    """Convert every entry of ``self.FILES`` with a configuration setting ``ffmpeg_sample_rate=44100``."""
    custom_rconf = RuntimeConfiguration(u"ffmpeg_sample_rate=44100")
    for entry in self.FILES:
        source_path = entry["path"]
        self.convert(source_path, runtime_configuration=custom_rconf)
def test_sample_rate(self):
    """A default configuration has ``sample_rate`` equal to ``16000``."""
    default_rate = RuntimeConfiguration().sample_rate
    self.assertEqual(default_rate, 16000)