def _synthesize(self):
    """
    Synthesize text into a ``wav`` file.

    Return a quadruple:

    1. a success bool flag
    2. handler of the generated wave file
    3. path of the generated wave file
    4. the list of anchors, that is, a list of floats
       each representing the start time of the corresponding
       text fragment in the generated wave file
       ``[start_1, start_2, ..., start_n]``
    """
    self._log("Synthesizing text")
    handler = None
    path = None
    anchors = None
    try:
        self._log("Creating an output tempfile")
        handler, path = tempfile.mkstemp(
            suffix=".wav",
            dir=gf.custom_tmp_dir()
        )
        self._log("Creating Synthesizer object")
        synt = Synthesizer(logger=self.logger)
        self._log("Synthesizing...")
        anchors = synt.synthesize(self.task.text_file, path)
        self._log("Synthesizing... done")
        self._log("Synthesizing text: succeeded")
        return (True, handler, path, anchors)
    except Exception as exc:
        # Catch Exception only: a bare ``except:`` would also swallow
        # KeyboardInterrupt/SystemExit.  Log the cause so failures
        # are diagnosable instead of silent.
        self._log("Synthesizing text: failed")
        self._log(["Message: %s", str(exc)])
        return (False, handler, path, anchors)
def _synthesize(self):
    """
    Synthesize text into a ``wav`` file.

    Return a quadruple:

    1. a success bool flag
    2. handler of the generated wave file
    3. path of the generated wave file
    4. the list of anchors, that is, a list of floats
       each representing the start time of the corresponding
       text fragment in the generated wave file
       ``[start_1, start_2, ..., start_n]``
    """
    self._log("Synthesizing text")
    handler, path, anchors = None, None, None
    try:
        self._log("Creating an output tempfile")
        handler, path = tempfile.mkstemp(
            suffix=".wav",
            dir=gf.custom_tmp_dir()
        )
        self._log("Creating Synthesizer object")
        synthesizer = Synthesizer(logger=self.logger)
        self._log("Synthesizing...")
        # synthesize() returns a tuple; its first element is the anchor list
        anchors = synthesizer.synthesize(self.task.text_file, path)[0]
        self._log("Synthesizing... done")
        self._log("Synthesizing text: succeeded")
        return (True, handler, path, anchors)
    except Exception as error:
        self._log("Synthesizing text: failed")
        self._log(["Message: %s", str(error)])
        return (False, handler, path, anchors)
def test_synthesize(self):
    """
    Synthesizing a plain text file must produce at least one anchor.
    """
    handler, output_file_path = tempfile.mkstemp(suffix=".wav")
    try:
        tfl = TextFile(get_abs_path("res/inputtext/sonnet_plain.txt"), TextFileFormat.PLAIN)
        tfl.set_language(Language.EN)
        synth = Synthesizer()
        anchors = synth.synthesize(tfl, output_file_path)
        self.assertGreater(len(anchors), 0)
    finally:
        # close the file descriptor returned by mkstemp (otherwise it
        # leaks) and remove the temp file even if the assertion fails
        os.close(handler)
        os.remove(output_file_path)
def test_synthesize_with_unicode(self):
    """
    Synthesizing a UTF-8 (German) parsed text file must produce at least one anchor.
    """
    handler, output_file_path = tempfile.mkstemp(suffix=".wav")
    try:
        tfl = TextFile(get_abs_path("res/inputtext/de_utf8.txt"), TextFileFormat.PARSED)
        tfl.set_language(Language.DE)
        synth = Synthesizer()
        anchors = synth.synthesize(tfl, output_file_path)
        self.assertGreater(len(anchors), 0)
    finally:
        # close the file descriptor returned by mkstemp (otherwise it
        # leaks) and remove the temp file even if the assertion fails
        os.close(handler)
        os.remove(output_file_path)
def perform(self, path, logger=None, quit_after=None, backwards=False):
    """
    Synthesize the plain text file at ``path`` into a temporary WAVE file
    and return whatever the synthesizer produced, cleaning up the temp file.
    """
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    text_file = TextFile(get_abs_path(path), TextFileFormat.PLAIN)
    text_file.set_language(Language.EN)
    synthesizer = Synthesizer(logger=logger)
    outcome = synthesizer.synthesize(
        text_file,
        wav_path,
        quit_after=quit_after,
        backwards=backwards
    )
    delete_file(fd, wav_path)
    return outcome
def inner(c_ext, cew_subprocess):
    """
    Run one synthesis with the given C-extension / cew-subprocess flags
    and check the number of anchors (and, optionally, the second result value).

    Closure variables: ``path``, ``logger``, ``quit_after``, ``backwards``,
    ``expected``, ``expected2``, ``self``.
    """
    fd, wav_path = gf.tmp_file(suffix=".wav")
    text_file = TextFile(gf.absolute_path(path, __file__), TextFileFormat.PLAIN)
    text_file.set_language(Language.ENG)
    synthesizer = Synthesizer(logger=logger)
    synthesizer.rconf[RuntimeConfiguration.C_EXTENSIONS] = c_ext
    synthesizer.rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = cew_subprocess
    outcome = synthesizer.synthesize(
        text_file,
        wav_path,
        quit_after=quit_after,
        backwards=backwards
    )
    gf.delete_file(fd, wav_path)
    self.assertEqual(len(outcome[0]), expected)
    if expected2 is not None:
        self.assertAlmostEqual(outcome[1], expected2, places=0)
def _synthesize(self, text_file):
    """
    Synthesize text into a WAVE file.

    Return:

    1. handler of the generated wave file
    2. path of the generated wave file
    3. the list of anchors, that is, a list of floats
       each representing the start time of the corresponding
       text fragment in the generated wave file
       ``[start_1, start_2, ..., start_n]``
    4. if the synthesizer produced a PCM16 mono WAVE file

    :param text_file: the text to be synthesized
    :type text_file: :class:`~aeneas.textfile.TextFile`
    :rtype: tuple (handler, string, list, bool)
    """
    # NOTE: the docstring previously documented a non-existent
    # ``synthesizer`` parameter; the synthesizer is created here.
    synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
    handler, path = gf.tmp_file(
        suffix=u".wav",
        root=self.rconf[RuntimeConfiguration.TMP_PATH]
    )
    result = synthesizer.synthesize(text_file, path)
    anchors = result[0]
    return (handler, path, anchors, synthesizer.output_is_mono_wave)
def _synthesize(self, text_file):
    """
    Synthesize text into a WAVE file.

    Return:

    1. handler of the generated wave file
    2. path of the generated wave file
    3. the list of anchors, that is, a list of floats
       each representing the start time of the corresponding
       text fragment in the generated wave file
       ``[start_1, start_2, ..., start_n]``
    4. if the synthesizer produced a PCM16 mono WAVE file

    :param text_file: the text to be synthesized
    :type text_file: :class:`~aeneas.textfile.TextFile`
    :rtype: tuple (handler, string, list, bool)
    """
    synt = Synthesizer(rconf=self.rconf, logger=self.logger)
    tmp_root = self.rconf[RuntimeConfiguration.TMP_PATH]
    handler, path = gf.tmp_file(suffix=u".wav", root=tmp_root)
    anchors = synt.synthesize(text_file, path)[0]
    return (handler, path, anchors, synt.output_is_mono_wave)
def test_clear_cache(self):
    """
    Clearing the cache of a freshly created Synthesizer must not raise.
    """
    synthesizer = Synthesizer()
    synthesizer.clear_cache()
def test_synthesize_none(self):
    """
    Passing ``None`` as the text file must raise ``TypeError``.
    """
    synthesizer = Synthesizer()
    with self.assertRaises(TypeError):
        synthesizer.synthesize(None, self.PATH_NOT_WRITEABLE)
class ExecuteTask(Loggable):
    """
    Execute a task, that is, compute the sync map for it.

    :param task: the task to be executed
    :type task: :class:`~aeneas.task.Task`
    :param rconf: a runtime configuration
    :type rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type logger: :class:`~aeneas.logger.Logger`
    """

    TAG = u"ExecuteTask"

    def __init__(self, task=None, rconf=None, logger=None):
        super(ExecuteTask, self).__init__(rconf=rconf, logger=logger)
        self.task = task
        # bookkeeping for the STEP logging helpers below
        self.step_index = 1
        self.step_label = u""
        self.step_begin_time = None
        self.step_total = 0.000
        self.synthesizer = None
        if task is not None:
            self.load_task(self.task)

    def load_task(self, task):
        """
        Load the task from the given ``Task`` object.

        :param task: the task to load
        :type task: :class:`~aeneas.task.Task`
        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`:
                 if ``task`` is not an instance of :class:`~aeneas.task.Task`
        """
        if not isinstance(task, Task):
            self.log_exc(u"task is not an instance of Task", None, True, ExecuteTaskInputError)
        self.task = task

    def _step_begin(self, label, log=True):
        """ Log begin of a step """
        if log:
            self.step_label = label
            # self.log(...) returns a timestamp, stored to compute duration
            self.step_begin_time = self.log(u"STEP %d BEGIN (%s)" % (self.step_index, label))

    def _step_end(self, log=True):
        """ Log end of a step """
        if log:
            step_end_time = self.log(u"STEP %d END (%s)" % (self.step_index, self.step_label))
            diff = (step_end_time - self.step_begin_time)
            diff = float(diff.seconds + diff.microseconds / 1000000.0)
            self.step_total += diff
            self.log(u"STEP %d DURATION %.3f (%s)" % (self.step_index, diff, self.step_label))
            self.step_index += 1

    def _step_failure(self, exc):
        """ Log failure of a step and re-raise as ExecuteTaskExecutionError """
        self.log_crit(u"STEP %d (%s) FAILURE" % (self.step_index, self.step_label))
        self.step_index += 1
        self.log_exc(u"Unexpected error while executing task", exc, True, ExecuteTaskExecutionError)

    def _step_total(self):
        """ Log total duration of all steps """
        self.log(u"STEP T DURATION %.3f" % (self.step_total))

    def execute(self):
        """
        Execute the task.
        The sync map produced will be stored inside the task object.

        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`:
                 if there is a problem with the input parameters
        :raises: :class:`~aeneas.executetask.ExecuteTaskExecutionError`:
                 if there is a problem during the task execution
        """
        self.log(u"Executing task...")
        # check that we have the AudioFile object
        if self.task.audio_file is None:
            self.log_exc(u"The task does not seem to have its audio file set", None, True, ExecuteTaskInputError)
        if ((self.task.audio_file.audio_length is None) or (self.task.audio_file.audio_length <= 0)):
            self.log_exc(u"The task seems to have an invalid audio file", None, True, ExecuteTaskInputError)
        task_max_audio_length = self.rconf[RuntimeConfiguration.TASK_MAX_AUDIO_LENGTH]
        # a non-positive max length disables the check
        if ((task_max_audio_length > 0) and (self.task.audio_file.audio_length > task_max_audio_length)):
            self.log_exc(
                u"The audio file of the task has length %.3f, more than the maximum allowed (%.3f)." % (self.task.audio_file.audio_length, task_max_audio_length),
                None, True, ExecuteTaskInputError)
        # check that we have the TextFile object
        if self.task.text_file is None:
            self.log_exc(u"The task does not seem to have its text file set", None, True, ExecuteTaskInputError)
        if len(self.task.text_file) == 0:
            self.log_exc(u"The task text file seems to have no text fragments", None, True, ExecuteTaskInputError)
        task_max_text_length = self.rconf[RuntimeConfiguration.TASK_MAX_TEXT_LENGTH]
        if ((task_max_text_length > 0) and (len(self.task.text_file) > task_max_text_length)):
            self.log_exc(
                u"The text file of the task has %d fragments, more than the maximum allowed (%d)." % (len(self.task.text_file), task_max_text_length),
                None, True, ExecuteTaskInputError)
        if self.task.text_file.chars == 0:
            self.log_exc(u"The task text file seems to have empty text", None, True, ExecuteTaskInputError)
        self.log(u"Both audio and text input file are present")
        # execute: reset step bookkeeping, then dispatch on text file format
        self.step_index = 1
        self.step_total = 0.000
        if self.task.text_file.file_format in TextFileFormat.MULTILEVEL_VALUES:
            self._execute_multi_level_task()
        else:
            self._execute_single_level_task()
        self.log(u"Executing task... done")

    def _execute_single_level_task(self):
        """ Execute a single-level task """
        self.log(u"Executing single level task...")
        try:
            # load audio file, extract MFCCs from real wave, clear audio file
            self._step_begin(u"extract MFCC real wave")
            real_wave_mfcc = self._extract_mfcc(
                file_path=self.task.audio_file_path_absolute,
                file_format=None,
            )
            self._step_end()
            # compute head and/or tail and set it
            self._step_begin(u"compute head tail")
            (head_length, process_length, tail_length) = self._compute_head_process_tail(real_wave_mfcc)
            real_wave_mfcc.set_head_middle_tail(head_length, process_length, tail_length)
            self._step_end()
            # compute alignment, outputting a tree of time intervals
            self._set_synthesizer()
            sync_root = Tree()
            self._execute_inner(
                real_wave_mfcc,
                self.task.text_file,
                sync_root=sync_root,
                force_aba_auto=False,
                log=True,
                leaf_level=True
            )
            self._clear_cache_synthesizer()
            # create syncmap and add it to task
            self._step_begin(u"create sync map")
            self._create_sync_map(sync_root=sync_root)
            self._step_end()
            # log total
            self._step_total()
            self.log(u"Executing single level task... done")
        except Exception as exc:
            self._step_failure(exc)

    def _execute_multi_level_task(self):
        """ Execute a multi-level task """
        self.log(u"Executing multi level task...")
        self.log(u"Saving rconf...")
        # save original rconf: self.rconf is swapped per level below
        # and must be restored before creating the sync map
        orig_rconf = self.rconf.clone()
        # clone rconfs and set granularity
        # TODO the following code assumes 3 levels: generalize this
        level_rconfs = [None, self.rconf.clone(), self.rconf.clone(), self.rconf.clone()]
        level_mfccs = [None, None, None, None]
        # index 0 unused so that index i == level i
        force_aba_autos = [None, False, False, True]
        for i in range(1, len(level_rconfs)):
            level_rconfs[i].set_granularity(i)
            self.log([u"Level %d mmn: %s", i, level_rconfs[i].mmn])
            self.log([u"Level %d mwl: %.3f", i, level_rconfs[i].mwl])
            self.log([u"Level %d mws: %.3f", i, level_rconfs[i].mws])
            level_rconfs[i].set_tts(i)
            self.log([u"Level %d tts: %s", i, level_rconfs[i].tts])
            self.log([u"Level %d tts_path: %s", i, level_rconfs[i].tts_path])
        self.log(u"Saving rconf... done")
        try:
            self.log(u"Creating AudioFile object...")
            audio_file = self._load_audio_file()
            self.log(u"Creating AudioFile object... done")
            # extract MFCC for each level; reuse the previous level's MFCCs
            # when window length/shift are unchanged
            for i in range(1, len(level_rconfs)):
                self._step_begin(u"extract MFCC real wave level %d" % i)
                if (i == 1) or (level_rconfs[i].mws != level_rconfs[i - 1].mws) or (level_rconfs[i].mwl != level_rconfs[i - 1].mwl):
                    self.rconf = level_rconfs[i]
                    level_mfccs[i] = self._extract_mfcc(audio_file=audio_file)
                else:
                    self.log(u"Keeping MFCC real wave from previous level")
                    level_mfccs[i] = level_mfccs[i - 1]
                self._step_end()
            self.log(u"Clearing AudioFile object...")
            self.rconf = level_rconfs[1]
            self._clear_audio_file(audio_file)
            self.log(u"Clearing AudioFile object... done")
            # compute head tail for the entire real wave (level 1)
            self._step_begin(u"compute head tail")
            (head_length, process_length, tail_length) = self._compute_head_process_tail(level_mfccs[1])
            level_mfccs[1].set_head_middle_tail(head_length, process_length, tail_length)
            self._step_end()
            # compute alignment at each level
            sync_root = Tree()
            sync_roots = [sync_root]
            text_files = [self.task.text_file]
            number_levels = len(level_rconfs)
            for i in range(1, number_levels):
                self._step_begin(u"compute alignment level %d" % i)
                self.rconf = level_rconfs[i]
                text_files, sync_roots = self._execute_level(
                    level=i,
                    audio_file_mfcc=level_mfccs[i],
                    text_files=text_files,
                    sync_roots=sync_roots,
                    force_aba_auto=force_aba_autos[i],
                )
                self._step_end()
            # restore original rconf, and create syncmap and add it to task
            self._step_begin(u"create sync map")
            self.rconf = orig_rconf
            self._create_sync_map(sync_root=sync_root)
            self._step_end()
            self._step_total()
            self.log(u"Executing multi level task... done")
        except Exception as exc:
            self._step_failure(exc)

    def _execute_level(self, level, audio_file_mfcc, text_files, sync_roots, force_aba_auto=False):
        """
        Compute the alignment for all the nodes in the given level.

        Return a pair (next_level_text_files, next_level_sync_roots),
        containing two lists of text file subtrees and sync map subtrees
        on the next level.

        :param int level: the level
        :param audio_file_mfcc: the audio MFCC representation for this level
        :type audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param list text_files: a list of :class:`~aeneas.textfile.TextFile` objects,
                                each representing a (sub)tree of the Task text file
        :param list sync_roots: a list of :class:`~aeneas.tree.Tree` objects,
                                each representing a SyncMapFragment tree,
                                one for each element in ``text_files``
        :param bool force_aba_auto: if ``True``, force using the AUTO ABA algorithm
        :rtype: (list, list)
        """
        self._set_synthesizer()
        next_level_text_files = []
        next_level_sync_roots = []
        for text_file_index, text_file in enumerate(text_files):
            self.log([u"Text level %d, fragment %d", level, text_file_index])
            self.log([u" Len: %d", len(text_file)])
            sync_root = sync_roots[text_file_index]
            if (level > 1) and (len(text_file) == 1):
                self.log(u"Level > 1 and only one text fragment => return trivial tree")
                self._append_trivial_tree(text_file, sync_root)
            elif (level > 1) and (sync_root.value.begin == sync_root.value.end):
                self.log(u"Level > 1 and parent has begin == end => return trivial tree")
                self._append_trivial_tree(text_file, sync_root)
            else:
                self.log(u"Level == 1 or more than one text fragment with non-zero parent => compute tree")
                if not sync_root.is_empty:
                    # constrain the alignment to the parent's interval
                    begin = sync_root.value.begin
                    end = sync_root.value.end
                    self.log([u" Setting begin: %.3f", begin])
                    self.log([u" Setting end: %.3f", end])
                    audio_file_mfcc.set_head_middle_tail(head_length=begin, middle_length=(end - begin))
                else:
                    self.log(u" No begin or end to set")
                # NOTE(review): leaf_level=(level == 3) hard-codes the
                # 3-level assumption mentioned in _execute_multi_level_task
                self._execute_inner(
                    audio_file_mfcc,
                    text_file,
                    sync_root=sync_root,
                    force_aba_auto=force_aba_auto,
                    log=False,
                    leaf_level=(level == 3)
                )
            # store next level roots
            next_level_text_files.extend(text_file.children_not_empty)
            # we added head and tail, we must not pass them to the next level
            next_level_sync_roots.extend(sync_root.children[1:-1])
        self._clear_cache_synthesizer()
        return (next_level_text_files, next_level_sync_roots)

    def _execute_inner(self, audio_file_mfcc, text_file, sync_root=None, force_aba_auto=False, log=True, leaf_level=False):
        """
        Align a subinterval of the given AudioFileMFCC
        with the given TextFile.

        Return the computed tree of time intervals,
        rooted at ``sync_root`` if the latter is not ``None``,
        or as a new ``Tree`` otherwise.

        The begin and end positions inside the AudioFileMFCC
        must have been set ahead by the caller.

        The text fragments being aligned are the vchildren of ``text_file``.

        :param audio_file_mfcc: the audio file MFCC representation
        :type audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param text_file: the text file subtree to align
        :type text_file: :class:`~aeneas.textfile.TextFile`
        :param sync_root: the tree node to which fragments should be appended
        :type sync_root: :class:`~aeneas.tree.Tree`
        :param bool force_aba_auto: if ``True``, do not run aba algorithm
        :param bool log: if ``True``, log steps
        :param bool leaf_level: alert aba if the computation is at a leaf level
        :rtype: :class:`~aeneas.tree.Tree`
        """
        # Custom hook: text files whose path contains 'timings' carry
        # precomputed anchor times instead of requiring synthesis.
        if 'timings' in text_file.file_path:
            self._step_begin(u"extract timings", log=log)
            synt_path, synt_anchors, synt_format = self._provide_times(text_file)
            self._step_end(log=log)
            self._step_begin(u"extract MFCC synt wave", log=log)
            synt_wave_mfcc = self._extract_mfcc(
                file_path=synt_path,
                file_format=synt_format,
            )
            # the provided wave file is not deleted here (no handler owned)
            # gf.delete_file(synt_handler, synt_path)
            self._step_end(log=log)
        else:
            self._step_begin(u"synthesize text", log=log)
            # Custom hook: paths containing 'clips' are concatenated from
            # existing audio clips instead of TTS-synthesized.
            func = '_time_and_combine' if 'clips' in text_file.file_path else '_synthesize'
            synt_handler, synt_path, synt_anchors, synt_format = getattr(self, func)(text_file)
            self._step_end(log=log)
            self._step_begin(u"extract MFCC synt wave", log=log)
            synt_wave_mfcc = self._extract_mfcc(
                file_path=synt_path,
                file_format=synt_format,
            )
            gf.delete_file(synt_handler, synt_path)
            self._step_end(log=log)
        self._step_begin(u"align waves", log=log)
        indices = self._align_waves(audio_file_mfcc, synt_wave_mfcc, synt_anchors)
        self._step_end(log=log)
        self._step_begin(u"adjust boundaries", log=log)
        self._adjust_boundaries(indices, text_file, audio_file_mfcc, sync_root, force_aba_auto, leaf_level)
        self._step_end(log=log)

    def _provide_times(self, text_file):
        """
        Read precomputed anchor times from the CSV-like timings file
        backing ``text_file``.

        Return a triple (synt_wav, synt_anchors, synt_format), where
        ``synt_wav`` is the wave path taken from the first row.

        NOTE(review): each row is assumed to be ``verse,start,file`` —
        confirm against the producer of these timings files.
        """
        with open(text_file.file_path) as file:
            timings = [row.strip().split(',') for row in file.readlines()]
        # NOTE(review): the loop variable ``file`` here shadows the file
        # object above; the anchors end up carrying the per-row path string
        synt_anchors = [[TimeValue(start), verse, file] for verse, start, file in timings]
        synt_wav = timings[0][-1]
        # NOTE(review): 'pcm_s161e' looks like a typo for 'pcm_s16le'
        # (lowercase L, not one) — confirm against AudioFileMFCC's
        # accepted file_format codes before changing
        synt_format = ('pcm_s161e', 1, 2)
        return synt_wav, synt_anchors, synt_format

    def _time_and_combine(self, text_file):
        """
        Combine original audio clips into a single WAV file.

        Return a tuple consisting of:

        1. the handler of the generated audio file
        2. the path of the generated audio file
        3. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        4. a tuple describing the format of the audio file

        :param text_file: the text with audio clips to be timed/combined
        :type text_file: :class:`~aeneas.textfile.TextFile`
        :rtype: tuple (handler, string, list, tuple)
        """
        import subprocess
        # Concatenate all clips into a single, temporary file
        handler, path = gf.tmp_file(
            suffix=u".wav",
            root=self.rconf[RuntimeConfiguration.TMP_PATH])
        # NOTE(review): shell=True with a formatted command is fragile if
        # paths contain spaces/quotes; consider a list argv in a follow-up
        cmd = "ffmpeg -y -f concat -i {} -c copy {}".format(text_file.file_path, path)
        subprocess.call(cmd, shell=True)
        # NOTE(review): 'pcm_s161e' looks like a typo for 'pcm_s16le' — confirm
        audio_format = ('pcm_s161e', 1, 2)
        # Build "synt" anchor times by accumulating each clip's length
        anchor_time, anchors = TimeValue('0.0'), []
        for fragment in text_file.fragments:
            # clip file name is extracted from the quoted part of the fragment text
            audio_path = 'output/sample/{}'.format(fragment.text.split("'")[1])
            audio_file = AudioFileMFCC(file_path=audio_path, file_format=audio_format)
            # TODO: Investigate faster ways to get the audio_length,
            # e.g. ffprobe -show_entries format=duration via subprocess.Popen,
            # instead of building the full MFCC representation per clip
            anchors.append([anchor_time, fragment.identifier, audio_path])
            anchor_time += audio_file.audio_length
        # resulting anchors look like:
        # [[TimeValue('0.0'), u'f000001', 'output/sample/audio01.wav'], ...]
        return (handler, path, anchors, audio_format)

    def _load_audio_file(self):
        """
        Load audio in memory.

        :rtype: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"load audio file")
        # NOTE file_format=None forces conversion to
        # PCM16 mono WAVE with default sample rate
        audio_file = AudioFile(
            file_path=self.task.audio_file_path_absolute,
            file_format=None,
            rconf=self.rconf,
            logger=self.logger
        )
        audio_file.read_samples_from_file()
        self._step_end()
        return audio_file

    def _clear_audio_file(self, audio_file):
        """
        Clear audio from memory.

        :param audio_file: the object to clear
        :type audio_file: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"clear audio file")
        audio_file.clear_data()
        audio_file = None
        self._step_end()

    def _extract_mfcc(self, file_path=None, file_format=None, audio_file=None):
        """
        Extract the MFCCs from the given audio file.

        :rtype: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        """
        audio_file_mfcc = AudioFileMFCC(
            file_path=file_path,
            file_format=file_format,
            audio_file=audio_file,
            rconf=self.rconf,
            logger=self.logger
        )
        # rconf.mmn: masked non-speech enabled => run VAD now
        if self.rconf.mmn:
            self.log(u"Running VAD inside _extract_mfcc...")
            audio_file_mfcc.run_vad(
                log_energy_threshold=self.rconf[RuntimeConfiguration.MFCC_MASK_LOG_ENERGY_THRESHOLD],
                min_nonspeech_length=self.rconf[RuntimeConfiguration.MFCC_MASK_MIN_NONSPEECH_LENGTH],
                extend_before=self.rconf[RuntimeConfiguration.MFCC_MASK_EXTEND_SPEECH_INTERVAL_BEFORE],
                extend_after=self.rconf[RuntimeConfiguration.MFCC_MASK_EXTEND_SPEECH_INTERVAL_AFTER])
            self.log(u"Running VAD inside _extract_mfcc... done")
        return audio_file_mfcc

    def _compute_head_process_tail(self, audio_file_mfcc):
        """
        Set the audio file head or tail,
        by either reading the explicit values
        from the Task configuration,
        or using SD to determine them.

        This function returns the lengths, in seconds,
        of the (head, process, tail).

        :rtype: tuple (float, float, float)
        """
        head_length = self.task.configuration["i_a_head"]
        process_length = self.task.configuration["i_a_process"]
        tail_length = self.task.configuration["i_a_tail"]
        head_max = self.task.configuration["i_a_head_max"]
        head_min = self.task.configuration["i_a_head_min"]
        tail_max = self.task.configuration["i_a_tail_max"]
        tail_min = self.task.configuration["i_a_tail_min"]
        if ((head_length is not None) or (process_length is not None) or (tail_length is not None)):
            # any explicit value wins over detection
            self.log(u"Setting explicit head process tail")
        else:
            self.log(u"Detecting head tail...")
            sd = SD(audio_file_mfcc, self.task.text_file, rconf=self.rconf, logger=self.logger)
            head_length = TimeValue("0.000")
            process_length = None
            tail_length = TimeValue("0.000")
            if (head_min is not None) or (head_max is not None):
                self.log(u"Detecting HEAD...")
                head_length = sd.detect_head(head_min, head_max)
                self.log([u"Detected HEAD: %.3f", head_length])
                self.log(u"Detecting HEAD... done")
            if (tail_min is not None) or (tail_max is not None):
                self.log(u"Detecting TAIL...")
                tail_length = sd.detect_tail(tail_min, tail_max)
                self.log([u"Detected TAIL: %.3f", tail_length])
                self.log(u"Detecting TAIL... done")
            self.log(u"Detecting head tail... done")
        self.log([u"Head: %s", gf.safe_float(head_length, None)])
        self.log([u"Process: %s", gf.safe_float(process_length, None)])
        self.log([u"Tail: %s", gf.safe_float(tail_length, None)])
        return (head_length, process_length, tail_length)

    def _set_synthesizer(self):
        """ Create synthesizer """
        self.log(u"Setting synthesizer...")
        self.synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
        self.log(u"Setting synthesizer... done")

    def _clear_cache_synthesizer(self):
        """ Clear the cache of the synthesizer """
        self.log(u"Clearing synthesizer...")
        self.synthesizer.clear_cache()
        self.log(u"Clearing synthesizer... done")

    def _synthesize(self, text_file):
        """
        Synthesize text into a WAVE file.

        Return a tuple consisting of:

        1. the handler of the generated audio file
        2. the path of the generated audio file
        3. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        4. a tuple describing the format of the audio file

        :param text_file: the text to be synthesized
        :type text_file: :class:`~aeneas.textfile.TextFile`
        :rtype: tuple (handler, string, list, tuple)
        """
        handler, path = gf.tmp_file(
            suffix=u".wav",
            root=self.rconf[RuntimeConfiguration.TMP_PATH])
        result = self.synthesizer.synthesize(text_file, path)
        return (handler, path, result[0], self.synthesizer.output_audio_format)

    def _align_waves(self, real_wave_mfcc, synt_wave_mfcc, synt_anchors):
        """
        Align two AudioFileMFCC objects,
        representing WAVE files.

        Return a list of boundary indices.
        """
        self.log(u"Creating DTWAligner...")
        aligner = DTWAligner(real_wave_mfcc, synt_wave_mfcc, rconf=self.rconf, logger=self.logger)
        self.log(u"Creating DTWAligner... done")
        self.log(u"Computing boundary indices...")
        boundary_indices = aligner.compute_boundaries(synt_anchors)
        self.log(u"Computing boundary indices... done")
        return boundary_indices

    def _adjust_boundaries(self, boundary_indices, text_file, real_wave_mfcc, sync_root, force_aba_auto=False, leaf_level=False):
        """
        Adjust boundaries as requested by the user.

        Return the computed time map, that is,
        a list of pairs ``[start_time, end_time]``,
        of length equal to number of fragments + 2,
        where the two extra elements are for
        the HEAD (first) and TAIL (last).
        """
        # boundary_indices contains the boundary indices in the all_mfcc of real_wave_mfcc
        # starting with the (head-1st fragment) and ending with (-1th fragment-tail)
        aba_parameters = self.task.configuration.aba_parameters()
        if force_aba_auto:
            self.log(u"Forced running algorithm: 'auto'")
            aba_parameters["algorithm"] = (AdjustBoundaryAlgorithm.AUTO, [])
            # note that the other aba settings (nonspeech and nozero)
            # remain as specified by the user
        self.log([u"ABA parameters: %s", aba_parameters])
        aba = AdjustBoundaryAlgorithm(rconf=self.rconf, logger=self.logger)
        aba.adjust(
            aba_parameters=aba_parameters,
            real_wave_mfcc=real_wave_mfcc,
            boundary_indices=boundary_indices,
            text_file=text_file,
            allow_arbitrary_shift=leaf_level
        )
        aba.append_fragment_list_to_sync_root(sync_root=sync_root)

    def _append_trivial_tree(self, text_file, sync_root):
        """
        Append trivial tree, made by one HEAD,
        one sync map fragment for each element of ``text_file``,
        and one TAIL.

        This function is called if either ``text_file`` has only one element,
        or if ``sync_root.value`` is an interval with zero length
        (i.e., ``sync_root.value.begin == sync_root.value.end``).
        """
        interval = sync_root.value
        #
        # NOTE the following is correct, but it is a bit obscure
        # time_values = [interval.begin] * (1 + len(text_file)) + [interval.end] * 2
        #
        if len(text_file) == 1:
            time_values = [interval.begin, interval.begin, interval.end, interval.end]
        else:
            # interval.begin == interval.end
            time_values = [interval.begin] * (3 + len(text_file))
        aba = AdjustBoundaryAlgorithm(rconf=self.rconf, logger=self.logger)
        aba.intervals_to_fragment_list(text_file=text_file, time_values=time_values)
        aba.append_fragment_list_to_sync_root(sync_root=sync_root)

    def _create_sync_map(self, sync_root):
        """
        If requested, check that the computed sync map is consistent.
        Then, add it to the Task.
        """
        sync_map = SyncMap(tree=sync_root, rconf=self.rconf, logger=self.logger)
        if self.rconf.safety_checks:
            self.log(u"Running sanity check on computed sync map...")
            if not sync_map.leaves_are_consistent:
                self._step_failure(ValueError(u"The computed sync map contains inconsistent fragments"))
            self.log(u"Running sanity check on computed sync map... passed")
        else:
            self.log(u"Not running sanity check on computed sync map")
        self.task.sync_map = sync_map
def build_sync_map(
    text_paths,
    audio_paths,
    tmp_dir,
    sync_map_text_path_prefix,
    sync_map_audio_path_prefix,
    skip_penalty,
    radius
):
    """
    Build a sync map by synthesizing text and then aligning the synthesized
    audio with the recorded audio using a variation of the DTW
    (Dynamic Time Warping) algorithm.

    The main features of this algorithm are:

    1) It can handle structural differences in the beginning and in the
       end of files.
    2) It finds an approximation to an optimal warping path in linear time
       and space using the FastDTW approach.

    Note that while the algorithm does not require one-to-one correspondence
    between text and audio files (i.e. the splitting can be done differently),
    the quality of the result is sensitive to the choice of the
    ``skip_penalty`` and ``radius`` parameters, so it is recommended to have
    such a correspondence.

    Alignment details:
    Synthesized and recorded audio are represented as sequences of MFCC
    frames. These sequences are aligned using a variation of the DTW
    algorithm. In contrast to the classic DTW, this algorithm can be used to
    align sequences with structural differences in the beginning or in the
    end.

    Steps to build a sync map:

    1) Synthesize a text file and produce a list of anchors. Each anchor
       represents the start of the corresponding text fragment in the
       synthesized audio.
    2) Get sequences of MFCC frames of the synthesized and recorded audio.
    3) Get their warping path by calling the alignment algorithm.
    4) Check whether extra content is found, calculate mapping boundaries.
    5) Map anchors inside the boundaries to the recorded MFCC sequence using
       the warping path from step 3.
    6) Start all over again considering:
       If there is extra content in the end of the synthesized sequence,
       align it with the next audio file.
       If there is extra content in the end of the recorded sequence,
       align it with the next text file.
       If both sequences have extra content in the end, align the text tail
       with the next audio file.
       If none of the above, align the next text and audio files.

    :param text_paths: iterator of text file paths (consumed with ``next``)
    :param audio_paths: iterator of audio file paths (consumed with ``next``)
    :param tmp_dir: directory for intermediate WAVE files
    :param sync_map_text_path_prefix: prefix joined to text names in the result keys
    :param sync_map_audio_path_prefix: prefix joined to audio names in the result values
    :param skip_penalty: skip penalty for the boundary-detecting DTW variant
    :param radius: FastDTW search radius
    :rtype: dict mapping text name -> {fragment_id: {audio_file, begin_time, end_time}},
            or ``{}`` when no match is found
    """
    synthesizer = Synthesizer()
    parse_parameters = {'is_text_unparsed_id_regex': 'f[0-9]+'}
    sync_map = {}
    # Flags deciding whether the next iteration pulls a new text/audio file
    # or keeps aligning the unmatched tail of the current one.
    process_next_text = True
    process_next_audio = True
    while True:
        if process_next_text:
            try:
                text_path = next(text_paths)
            except StopIteration:
                break
            text_name = get_name_from_path(text_path)
            output_text_name = os.path.join(sync_map_text_path_prefix, text_name)
            textfile = TextFile(text_path, file_format=TextFileFormat.UNPARSED, parameters=parse_parameters)
            textfile.set_language(Language.ENG)
            text_wav_path = os.path.join(tmp_dir, f'{drop_extension(text_name)}_text.wav')
            sync_map[output_text_name] = {}
            # Produce synthesized audio, get anchors
            anchors, _, _ = synthesizer.synthesize(textfile, text_wav_path)
            # Get fragments, convert anchors timings to frame indices
            # (0.040 s per MFCC frame, matching the "* 0.040" below)
            fragments = [a[1] for a in anchors]
            anchors = np.array([int(a[0] / TimeValue('0.040')) for a in anchors])
            # MFCC frames sequence memory layout is a n x l 2D array,
            # where n - number of frames and l - number of MFCCs
            # i.e it is c-contiguous, but after dropping the first coefficient
            # it ceases to be c-contiguous.
            # Should decide whether to make a copy or to work around the first coefficient.
            text_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(text_wav_path).all_mfcc.T[:, 1:]
            )
        if process_next_audio:
            try:
                audio_path = next(audio_paths)
            except StopIteration:
                break
            audio_name = get_name_from_path(audio_path)
            output_audio_name = os.path.join(sync_map_audio_path_prefix, audio_name)
            audio_wav_path = os.path.join(tmp_dir, f'{drop_extension(audio_name)}_audio.wav')
            # NOTE(review): '-n' makes ffmpeg fail instead of overwriting, and the
            # exit code is not checked — presumably an existing wav is reused
            # on purpose; TODO confirm this is intended.
            subprocess.run(['ffmpeg', '-n', '-i', audio_path, audio_wav_path])
            audio_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(audio_wav_path).all_mfcc.T[:, 1:]
            )
            # Keep track to calculate frames timings
            audio_start_frame = 0
        # NOTE(review): n is computed but never used below (only m is).
        n = len(text_mfcc_sequence)
        m = len(audio_mfcc_sequence)
        _, path = c_FastDTWBD(text_mfcc_sequence, audio_mfcc_sequence, skip_penalty, radius=radius)
        if len(path) == 0:
            print(
                f'No match between {text_name} and {audio_name}. '
                f'Alignment is terminated. '
                f'Adjust skip_penalty or input files.'
            )
            return {}
        # Project path to the text and audio sequences
        text_path_frames = path[:, 0]
        audio_path_frames = path[:, 1]
        last_matched_audio_frame = audio_path_frames[-1]
        # Find first and last matched frames
        first_matched_text_frame = text_path_frames[0]
        last_matched_text_frame = text_path_frames[-1]
        # Map only those fragments that intersect matched frames
        anchors_boundary_indices = np.searchsorted(
            anchors,
            [first_matched_text_frame, last_matched_text_frame]
        )
        map_anchors_from = max(anchors_boundary_indices[0] - 1, 0)
        map_anchors_to = anchors_boundary_indices[1]
        anchors_to_map = anchors[map_anchors_from:map_anchors_to]
        fragments_to_map = fragments[map_anchors_from:map_anchors_to]
        # Get anchors indices in the path projection to the text sequence
        text_path_anchor_indices = np.searchsorted(text_path_frames, anchors_to_map)
        # Get anchors' frames in audio sequence, calculate their timings
        anchors_matched_frames = audio_path_frames[text_path_anchor_indices]
        timings = (np.append(anchors_matched_frames, audio_path_frames[-1]) + audio_start_frame) * 0.040
        # Map fragment_ids to timings, update mapping of the current text file
        fragment_map = {
            f: {
                'audio_file': output_audio_name,
                'begin_time': time_to_str(bt),
                'end_time': time_to_str(et)
            }
            for f, bt, et in zip(fragments_to_map, timings[:-1], timings[1:])
        }
        sync_map[output_text_name].update(fragment_map)
        # Decide whether to process next file or to align the tail of the current one
        if map_anchors_to == len(anchors):
            # Process next text if no fragments are left
            process_next_text = True
        else:
            # Otherwise align tail of the current text:
            # drop the matched prefix and rebase anchors onto the new origin
            process_next_text = False
            text_mfcc_sequence = text_mfcc_sequence[last_matched_text_frame:]
            fragments = fragments[map_anchors_to:]
            anchors = anchors[map_anchors_to:] - last_matched_text_frame
        if last_matched_audio_frame == m - 1 or not process_next_text:
            # Process next audio if there are no unmatched audio frames in the tail
            # or there are more text fragments to map, i.e.
            # we choose to process next audio if we cannot decide.
            # This strategy is correct if there are no extra fragments in the end.
            process_next_audio = True
        else:
            # Otherwise align tail of the current audio;
            # audio_start_frame keeps timings absolute within the original file
            process_next_audio = False
            audio_mfcc_sequence = audio_mfcc_sequence[last_matched_audio_frame:]
            audio_start_frame += last_matched_audio_frame
    return sync_map
def _set_synthesizer(self):
    """
    Create a fresh ``Synthesizer`` for the current runtime configuration
    and logger, and store it in ``self.synthesizer``.
    """
    self.log(u"Setting synthesizer...")
    synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
    self.synthesizer = synthesizer
    self.log(u"Setting synthesizer... done")
class ExecuteTask(Loggable):
    """
    Execute a task, that is, compute the sync map for it.

    :param task: the task to be executed
    :type task: :class:`~aeneas.task.Task`
    :param rconf: a runtime configuration
    :type rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type logger: :class:`~aeneas.logger.Logger`
    """

    TAG = u"ExecuteTask"

    def __init__(self, task=None, rconf=None, logger=None):
        super(ExecuteTask, self).__init__(rconf=rconf, logger=logger)
        self.task = task
        # Step bookkeeping used by _step_begin/_step_end/_step_failure
        self.step_index = 1
        self.step_label = u""
        self.step_begin_time = None
        self.step_total = 0.000
        self.synthesizer = None
        if task is not None:
            self.load_task(self.task)

    def load_task(self, task):
        """
        Load the task from the given ``Task`` object.

        :param task: the task to load
        :type task: :class:`~aeneas.task.Task`
        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`:
                 if ``task`` is not an instance of :class:`~aeneas.task.Task`
        """
        if not isinstance(task, Task):
            self.log_exc(u"task is not an instance of Task", None, True, ExecuteTaskInputError)
        self.task = task

    def _step_begin(self, label, log=True):
        """ Log begin of a step (no-op when ``log`` is ``False``) """
        if log:
            self.step_label = label
            # self.log returns the timestamp, remembered for the duration diff
            self.step_begin_time = self.log(u"STEP %d BEGIN (%s)" % (self.step_index, label))

    def _step_end(self, log=True):
        """ Log end of a step and accumulate its duration (no-op when ``log`` is ``False``) """
        if log:
            step_end_time = self.log(u"STEP %d END (%s)" % (self.step_index, self.step_label))
            diff = (step_end_time - self.step_begin_time)
            diff = float(diff.seconds + diff.microseconds / 1000000.0)
            self.step_total += diff
            self.log(u"STEP %d DURATION %.3f (%s)" % (self.step_index, diff, self.step_label))
            # NOTE: step_index advances only when log=True
            self.step_index += 1

    def _step_failure(self, exc):
        """ Log failure of a step; raises ExecuteTaskExecutionError via log_exc """
        self.log_crit(u"STEP %d (%s) FAILURE" % (self.step_index, self.step_label))
        self.step_index += 1
        self.log_exc(u"Unexpected error while executing task", exc, True, ExecuteTaskExecutionError)

    def _step_total(self):
        """ Log total duration accumulated over all steps """
        self.log(u"STEP T DURATION %.3f" % (self.step_total))

    def execute(self):
        """
        Execute the task.
        The sync map produced will be stored inside the task object.

        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`:
                 if there is a problem with the input parameters
        :raises: :class:`~aeneas.executetask.ExecuteTaskExecutionError`:
                 if there is a problem during the task execution
        """
        self.log(u"Executing task...")

        # check that we have the AudioFile object
        if self.task.audio_file is None:
            self.log_exc(u"The task does not seem to have its audio file set", None, True, ExecuteTaskInputError)
        if (
                (self.task.audio_file.audio_length is None) or
                (self.task.audio_file.audio_length <= 0)
        ):
            self.log_exc(u"The task seems to have an invalid audio file", None, True, ExecuteTaskInputError)
        task_max_audio_length = self.rconf[RuntimeConfiguration.TASK_MAX_AUDIO_LENGTH]
        if (
                (task_max_audio_length > 0) and
                (self.task.audio_file.audio_length > task_max_audio_length)
        ):
            self.log_exc(u"The audio file of the task has length %.3f, more than the maximum allowed (%.3f)." % (self.task.audio_file.audio_length, task_max_audio_length), None, True, ExecuteTaskInputError)

        # check that we have the TextFile object
        if self.task.text_file is None:
            self.log_exc(u"The task does not seem to have its text file set", None, True, ExecuteTaskInputError)
        if len(self.task.text_file) == 0:
            self.log_exc(u"The task text file seems to have no text fragments", None, True, ExecuteTaskInputError)
        task_max_text_length = self.rconf[RuntimeConfiguration.TASK_MAX_TEXT_LENGTH]
        if (
                (task_max_text_length > 0) and
                (len(self.task.text_file) > task_max_text_length)
        ):
            self.log_exc(u"The text file of the task has %d fragments, more than the maximum allowed (%d)." % (len(self.task.text_file), task_max_text_length), None, True, ExecuteTaskInputError)
        if self.task.text_file.chars == 0:
            self.log_exc(u"The task text file seems to have empty text", None, True, ExecuteTaskInputError)

        self.log(u"Both audio and text input file are present")

        # execute
        self.step_index = 1
        self.step_total = 0.000
        if self.task.text_file.file_format in TextFileFormat.MULTILEVEL_VALUES:
            self._execute_multi_level_task()
        else:
            self._execute_single_level_task()
        self.log(u"Executing task... done")

    def _execute_single_level_task(self):
        """ Execute a single-level task """
        self.log(u"Executing single level task...")
        try:
            # load audio file, extract MFCCs from real wave, clear audio file
            self._step_begin(u"extract MFCC real wave")
            real_wave_mfcc = self._extract_mfcc(
                file_path=self.task.audio_file_path_absolute,
                file_format=None,
            )
            self._step_end()

            # compute head and/or tail and set it
            self._step_begin(u"compute head tail")
            (head_length, process_length, tail_length) = self._compute_head_process_tail(real_wave_mfcc)
            real_wave_mfcc.set_head_middle_tail(head_length, process_length, tail_length)
            self._step_end()

            # compute alignment, outputting a tree of time intervals
            self._set_synthesizer()
            sync_root = Tree()
            self._execute_inner(
                real_wave_mfcc,
                self.task.text_file,
                sync_root=sync_root,
                force_aba_auto=False,
                log=True,
                leaf_level=True
            )
            self._clear_cache_synthesizer()

            # create syncmap and add it to task
            self._step_begin(u"create sync map")
            self._create_sync_map(sync_root=sync_root)
            self._step_end()

            # log total
            self._step_total()
            self.log(u"Executing single level task... done")
        except Exception as exc:
            # _step_failure re-raises as ExecuteTaskExecutionError
            self._step_failure(exc)

    def _execute_multi_level_task(self):
        """ Execute a multi-level task """
        self.log(u"Executing multi level task...")
        self.log(u"Saving rconf...")
        # save original rconf
        orig_rconf = self.rconf.clone()
        # clone rconfs and set granularity
        # TODO the following code assumes 3 levels: generalize this
        level_rconfs = [None, self.rconf.clone(), self.rconf.clone(), self.rconf.clone()]
        level_mfccs = [None, None, None, None]
        force_aba_autos = [None, False, False, True]
        for i in range(1, len(level_rconfs)):
            level_rconfs[i].set_granularity(i)
            self.log([u"Level %d mmn: %s", i, level_rconfs[i].mmn])
            self.log([u"Level %d mwl: %.3f", i, level_rconfs[i].mwl])
            self.log([u"Level %d mws: %.3f", i, level_rconfs[i].mws])
            level_rconfs[i].set_tts(i)
            self.log([u"Level %d tts: %s", i, level_rconfs[i].tts])
            self.log([u"Level %d tts_path: %s", i, level_rconfs[i].tts_path])
        self.log(u"Saving rconf... done")
        try:
            self.log(u"Creating AudioFile object...")
            audio_file = self._load_audio_file()
            self.log(u"Creating AudioFile object... done")

            # extract MFCC for each level;
            # reuse the previous level's MFCCs when window length/shift are unchanged
            for i in range(1, len(level_rconfs)):
                self._step_begin(u"extract MFCC real wave level %d" % i)
                if (i == 1) or (level_rconfs[i].mws != level_rconfs[i - 1].mws) or (level_rconfs[i].mwl != level_rconfs[i - 1].mwl):
                    self.rconf = level_rconfs[i]
                    level_mfccs[i] = self._extract_mfcc(audio_file=audio_file)
                else:
                    self.log(u"Keeping MFCC real wave from previous level")
                    level_mfccs[i] = level_mfccs[i - 1]
                self._step_end()

            self.log(u"Clearing AudioFile object...")
            self.rconf = level_rconfs[1]
            self._clear_audio_file(audio_file)
            self.log(u"Clearing AudioFile object... done")

            # compute head tail for the entire real wave (level 1)
            self._step_begin(u"compute head tail")
            (head_length, process_length, tail_length) = self._compute_head_process_tail(level_mfccs[1])
            level_mfccs[1].set_head_middle_tail(head_length, process_length, tail_length)
            self._step_end()

            # compute alignment at each level
            sync_root = Tree()
            sync_roots = [sync_root]
            text_files = [self.task.text_file]
            number_levels = len(level_rconfs)
            for i in range(1, number_levels):
                self._step_begin(u"compute alignment level %d" % i)
                self.rconf = level_rconfs[i]
                text_files, sync_roots = self._execute_level(
                    level=i,
                    audio_file_mfcc=level_mfccs[i],
                    text_files=text_files,
                    sync_roots=sync_roots,
                    force_aba_auto=force_aba_autos[i],
                )
                self._step_end()

            # restore original rconf, and create syncmap and add it to task
            self._step_begin(u"create sync map")
            self.rconf = orig_rconf
            self._create_sync_map(sync_root=sync_root)
            self._step_end()

            self._step_total()
            self.log(u"Executing multi level task... done")
        except Exception as exc:
            # _step_failure re-raises as ExecuteTaskExecutionError
            self._step_failure(exc)

    def _execute_level(self, level, audio_file_mfcc, text_files, sync_roots, force_aba_auto=False):
        """
        Compute the alignment for all the nodes in the given level.

        Return a pair (next_level_text_files, next_level_sync_roots),
        containing two lists of text file subtrees and sync map subtrees
        on the next level.

        :param int level: the level
        :param audio_file_mfcc: the audio MFCC representation for this level
        :type audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param list text_files: a list of :class:`~aeneas.textfile.TextFile` objects,
                                each representing a (sub)tree of the Task text file
        :param list sync_roots: a list of :class:`~aeneas.tree.Tree` objects,
                                each representing a SyncMapFragment tree,
                                one for each element in ``text_files``
        :param bool force_aba_auto: if ``True``, force using the AUTO ABA algorithm
        :rtype: (list, list)
        """
        self._set_synthesizer()
        next_level_text_files = []
        next_level_sync_roots = []
        for text_file_index, text_file in enumerate(text_files):
            self.log([u"Text level %d, fragment %d", level, text_file_index])
            self.log([u" Len: %d", len(text_file)])
            sync_root = sync_roots[text_file_index]
            if (level > 1) and (len(text_file) == 1):
                self.log(u"Level > 1 and only one text fragment => return trivial tree")
                self._append_trivial_tree(text_file, sync_root)
            elif (level > 1) and (sync_root.value.begin == sync_root.value.end):
                self.log(u"Level > 1 and parent has begin == end => return trivial tree")
                self._append_trivial_tree(text_file, sync_root)
            else:
                self.log(u"Level == 1 or more than one text fragment with non-zero parent => compute tree")
                if not sync_root.is_empty:
                    # restrict the MFCC window to the parent's interval
                    begin = sync_root.value.begin
                    end = sync_root.value.end
                    self.log([u" Setting begin: %.3f", begin])
                    self.log([u" Setting end: %.3f", end])
                    audio_file_mfcc.set_head_middle_tail(head_length=begin, middle_length=(end - begin))
                else:
                    self.log(u" No begin or end to set")
                # NOTE: leaf_level assumes 3 levels (see TODO in _execute_multi_level_task)
                self._execute_inner(
                    audio_file_mfcc,
                    text_file,
                    sync_root=sync_root,
                    force_aba_auto=force_aba_auto,
                    log=False,
                    leaf_level=(level == 3)
                )
            # store next level roots
            next_level_text_files.extend(text_file.children_not_empty)
            # we added head and tail, we must not pass them to the next level
            next_level_sync_roots.extend(sync_root.children[1:-1])
        self._clear_cache_synthesizer()
        return (next_level_text_files, next_level_sync_roots)

    def _execute_inner(self, audio_file_mfcc, text_file, sync_root=None, force_aba_auto=False, log=True, leaf_level=False):
        """
        Align a subinterval of the given AudioFileMFCC
        with the given TextFile.

        Return the computed tree of time intervals,
        rooted at ``sync_root`` if the latter is not ``None``,
        or as a new ``Tree`` otherwise.

        The begin and end positions inside the AudioFileMFCC
        must have been set ahead by the caller.

        The text fragments being aligned are the vchildren of ``text_file``.

        :param audio_file_mfcc: the audio file MFCC representation
        :type audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param text_file: the text file subtree to align
        :type text_file: :class:`~aeneas.textfile.TextFile`
        :param sync_root: the tree node to which fragments should be appended
        :type sync_root: :class:`~aeneas.tree.Tree`
        :param bool force_aba_auto: if ``True``, force using the AUTO ABA algorithm
        :param bool log: if ``True``, log steps
        :param bool leaf_level: alert aba if the computation is at a leaf level
        :rtype: :class:`~aeneas.tree.Tree`
        """
        self._step_begin(u"synthesize text", log=log)
        synt_handler, synt_path, synt_anchors, synt_format = self._synthesize(text_file)
        self._step_end(log=log)

        self._step_begin(u"extract MFCC synt wave", log=log)
        synt_wave_mfcc = self._extract_mfcc(
            file_path=synt_path,
            file_format=synt_format,
        )
        # the synthesized wave is no longer needed once its MFCCs are extracted
        gf.delete_file(synt_handler, synt_path)
        self._step_end(log=log)

        self._step_begin(u"align waves", log=log)
        indices = self._align_waves(audio_file_mfcc, synt_wave_mfcc, synt_anchors)
        self._step_end(log=log)

        self._step_begin(u"adjust boundaries", log=log)
        self._adjust_boundaries(indices, text_file, audio_file_mfcc, sync_root, force_aba_auto, leaf_level)
        self._step_end(log=log)

    def _load_audio_file(self):
        """
        Load audio in memory.

        :rtype: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"load audio file")
        # NOTE file_format=None forces conversion to
        # PCM16 mono WAVE with default sample rate
        audio_file = AudioFile(
            file_path=self.task.audio_file_path_absolute,
            file_format=None,
            rconf=self.rconf,
            logger=self.logger
        )
        audio_file.read_samples_from_file()
        self._step_end()
        return audio_file

    def _clear_audio_file(self, audio_file):
        """
        Clear audio from memory.

        :param audio_file: the object to clear
        :type audio_file: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"clear audio file")
        audio_file.clear_data()
        # NOTE(review): rebinding the local name has no effect on the
        # caller's reference; clear_data() above does the actual work
        audio_file = None
        self._step_end()

    def _extract_mfcc(self, file_path=None, file_format=None, audio_file=None):
        """
        Extract the MFCCs from the given audio file.

        :rtype: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        """
        audio_file_mfcc = AudioFileMFCC(
            file_path=file_path,
            file_format=file_format,
            audio_file=audio_file,
            rconf=self.rconf,
            logger=self.logger
        )
        if self.rconf.mmn:
            # masked nonspeech requested: run VAD to mark nonspeech intervals
            self.log(u"Running VAD inside _extract_mfcc...")
            audio_file_mfcc.run_vad(
                log_energy_threshold=self.rconf[RuntimeConfiguration.MFCC_MASK_LOG_ENERGY_THRESHOLD],
                min_nonspeech_length=self.rconf[RuntimeConfiguration.MFCC_MASK_MIN_NONSPEECH_LENGTH],
                extend_before=self.rconf[RuntimeConfiguration.MFCC_MASK_EXTEND_SPEECH_INTERVAL_BEFORE],
                extend_after=self.rconf[RuntimeConfiguration.MFCC_MASK_EXTEND_SPEECH_INTERVAL_AFTER]
            )
            self.log(u"Running VAD inside _extract_mfcc... done")
        return audio_file_mfcc

    def _compute_head_process_tail(self, audio_file_mfcc):
        """
        Set the audio file head or tail,
        by either reading the explicit values
        from the Task configuration,
        or using SD to determine them.

        This function returns the lengths, in seconds,
        of the (head, process, tail).

        :rtype: tuple (float, float, float)
        """
        head_length = self.task.configuration["i_a_head"]
        process_length = self.task.configuration["i_a_process"]
        tail_length = self.task.configuration["i_a_tail"]
        head_max = self.task.configuration["i_a_head_max"]
        head_min = self.task.configuration["i_a_head_min"]
        tail_max = self.task.configuration["i_a_tail_max"]
        tail_min = self.task.configuration["i_a_tail_min"]
        if (
                (head_length is not None) or
                (process_length is not None) or
                (tail_length is not None)
        ):
            self.log(u"Setting explicit head process tail")
        else:
            # no explicit values: detect head/tail with SD within the
            # user-provided min/max bounds (if any)
            self.log(u"Detecting head tail...")
            sd = SD(audio_file_mfcc, self.task.text_file, rconf=self.rconf, logger=self.logger)
            head_length = TimeValue("0.000")
            process_length = None
            tail_length = TimeValue("0.000")
            if (head_min is not None) or (head_max is not None):
                self.log(u"Detecting HEAD...")
                head_length = sd.detect_head(head_min, head_max)
                self.log([u"Detected HEAD: %.3f", head_length])
                self.log(u"Detecting HEAD... done")
            if (tail_min is not None) or (tail_max is not None):
                self.log(u"Detecting TAIL...")
                tail_length = sd.detect_tail(tail_min, tail_max)
                self.log([u"Detected TAIL: %.3f", tail_length])
                self.log(u"Detecting TAIL... done")
            self.log(u"Detecting head tail... done")
        self.log([u"Head: %s", gf.safe_float(head_length, None)])
        self.log([u"Process: %s", gf.safe_float(process_length, None)])
        self.log([u"Tail: %s", gf.safe_float(tail_length, None)])
        return (head_length, process_length, tail_length)

    def _set_synthesizer(self):
        """ Create synthesizer """
        self.log(u"Setting synthesizer...")
        self.synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
        self.log(u"Setting synthesizer... done")

    def _clear_cache_synthesizer(self):
        """ Clear the cache of the synthesizer """
        self.log(u"Clearing synthesizer...")
        self.synthesizer.clear_cache()
        self.log(u"Clearing synthesizer... done")

    def _synthesize(self, text_file):
        """
        Synthesize text into a WAVE file.

        Return a tuple consisting of:

        1. the handler of the generated audio file
        2. the path of the generated audio file
        3. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        4. a tuple describing the format of the audio file

        :param text_file: the text to be synthesized
        :type text_file: :class:`~aeneas.textfile.TextFile`
        :rtype: tuple (handler, string, list, tuple)
        """
        handler, path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        result = self.synthesizer.synthesize(text_file, path)
        # result[0] is the anchors list
        return (handler, path, result[0], self.synthesizer.output_audio_format)

    def _align_waves(self, real_wave_mfcc, synt_wave_mfcc, synt_anchors):
        """
        Align two AudioFileMFCC objects,
        representing WAVE files.

        Return a list of boundary indices.
        """
        self.log(u"Creating DTWAligner...")
        aligner = DTWAligner(real_wave_mfcc, synt_wave_mfcc, rconf=self.rconf, logger=self.logger)
        self.log(u"Creating DTWAligner... done")
        self.log(u"Computing boundary indices...")
        boundary_indices = aligner.compute_boundaries(synt_anchors)
        self.log(u"Computing boundary indices... done")
        return boundary_indices

    def _adjust_boundaries(self, boundary_indices, text_file, real_wave_mfcc, sync_root, force_aba_auto=False, leaf_level=False):
        """
        Adjust boundaries as requested by the user.

        Return the computed time map, that is,
        a list of pairs ``[start_time, end_time]``,
        of length equal to number of fragments + 2,
        where the two extra elements are for
        the HEAD (first) and TAIL (last).
        """
        # boundary_indices contains the boundary indices in the all_mfcc of real_wave_mfcc
        # starting with the (head-1st fragment) and ending with (-1th fragment-tail)
        aba_parameters = self.task.configuration.aba_parameters()
        if force_aba_auto:
            self.log(u"Forced running algorithm: 'auto'")
            aba_parameters["algorithm"] = (AdjustBoundaryAlgorithm.AUTO, [])
            # note that the other aba settings (nonspeech and nozero)
            # remain as specified by the user
        self.log([u"ABA parameters: %s", aba_parameters])
        aba = AdjustBoundaryAlgorithm(rconf=self.rconf, logger=self.logger)
        aba.adjust(
            aba_parameters=aba_parameters,
            real_wave_mfcc=real_wave_mfcc,
            boundary_indices=boundary_indices,
            text_file=text_file,
            allow_arbitrary_shift=leaf_level
        )
        aba.append_fragment_list_to_sync_root(sync_root=sync_root)

    def _append_trivial_tree(self, text_file, sync_root):
        """
        Append trivial tree, made by one HEAD,
        one sync map fragment for each element of ``text_file``,
        and one TAIL.

        This function is called if either ``text_file`` has only one element,
        or if ``sync_root.value`` is an interval with zero length
        (i.e., ``sync_root.value.begin == sync_root.value.end``).
        """
        interval = sync_root.value
        #
        # NOTE the following is correct, but it is a bit obscure
        # time_values = [interval.begin] * (1 + len(text_file)) + [interval.end] * 2
        #
        if len(text_file) == 1:
            # single fragment spans the whole interval; HEAD/TAIL are zero-length
            time_values = [interval.begin, interval.begin, interval.end, interval.end]
        else:
            # interval.begin == interval.end
            time_values = [interval.begin] * (3 + len(text_file))
        aba = AdjustBoundaryAlgorithm(rconf=self.rconf, logger=self.logger)
        aba.intervals_to_fragment_list(
            text_file=text_file,
            time_values=time_values
        )
        aba.append_fragment_list_to_sync_root(sync_root=sync_root)

    def _create_sync_map(self, sync_root):
        """
        If requested, check that the computed sync map is consistent.
        Then, add it to the Task.
        """
        sync_map = SyncMap(tree=sync_root, rconf=self.rconf, logger=self.logger)
        if self.rconf.safety_checks:
            self.log(u"Running sanity check on computed sync map...")
            if not sync_map.leaves_are_consistent:
                # _step_failure logs and raises, aborting the task
                self._step_failure(ValueError(u"The computed sync map contains inconsistent fragments"))
            self.log(u"Running sanity check on computed sync map... passed")
        else:
            self.log(u"Not running sanity check on computed sync map")
        self.task.sync_map = sync_map
def perform_command(self):
    """
    Perform command and return the appropriate exit code.

    Expects at least four positional arguments:
    TEXT_FORMAT TEXT LANGUAGE OUTPUT_FILE
    plus optional ``--*-regex``, ``--sort``, ``--backwards``,
    ``--quit-after``, ``--start``, ``--end`` options.

    :rtype: int
    """
    if len(self.actual_arguments) < 4:
        return self.print_help()
    text_format = gf.safe_unicode(self.actual_arguments[0])
    if text_format == u"list":
        # "list" format: the text is passed inline on the command line
        text = gf.safe_unicode(self.actual_arguments[1])
    elif text_format in TextFileFormat.ALLOWED_VALUES:
        # file-based format: the second argument is a path
        text = self.actual_arguments[1]
        if not self.check_input_file(text):
            return self.ERROR_EXIT_CODE
    else:
        return self.print_help()
    l1_id_regex = self.has_option_with_value(u"--l1-id-regex")
    l2_id_regex = self.has_option_with_value(u"--l2-id-regex")
    l3_id_regex = self.has_option_with_value(u"--l3-id-regex")
    id_regex = self.has_option_with_value(u"--id-regex")
    class_regex = self.has_option_with_value(u"--class-regex")
    sort = self.has_option_with_value(u"--sort")
    backwards = self.has_option([u"-b", u"--backwards"])
    quit_after = gf.safe_float(self.has_option_with_value(u"--quit-after"), None)
    start_fragment = gf.safe_int(self.has_option_with_value(u"--start"), None)
    end_fragment = gf.safe_int(self.has_option_with_value(u"--end"), None)
    parameters = {
        gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: l1_id_regex,
        gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: l2_id_regex,
        gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: l3_id_regex,
        gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: class_regex,
        gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
        gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: sort,
    }
    # munparsed requires all three level regexes; unparsed requires id or class regex
    if (text_format == TextFileFormat.MUNPARSED) and (
            (l1_id_regex is None) or (l2_id_regex is None) or (l3_id_regex is None)):
        self.print_error(
            u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format"
        )
        return self.ERROR_EXIT_CODE
    if (text_format == TextFileFormat.UNPARSED) and (
            id_regex is None) and (class_regex is None):
        self.print_error(
            u"You must specify --id-regex and/or --class-regex for unparsed format"
        )
        return self.ERROR_EXIT_CODE
    language = gf.safe_unicode(self.actual_arguments[2])
    output_file_path = self.actual_arguments[3]
    if not self.check_output_file(output_file_path):
        return self.ERROR_EXIT_CODE
    text_file = self.get_text_file(text_format, text, parameters)
    if text_file is None:
        self.print_error(
            u"Unable to build a TextFile from the given parameters")
        return self.ERROR_EXIT_CODE
    elif len(text_file) == 0:
        self.print_error(u"No text fragments found")
        return self.ERROR_EXIT_CODE
    text_file.set_language(language)
    self.print_info(u"Read input text with %d fragments" % (len(text_file)))
    if start_fragment is not None:
        self.print_info(u"Slicing from index %d" % (start_fragment))
    if end_fragment is not None:
        self.print_info(u"Slicing to index %d" % (end_fragment))
    text_slice = text_file.get_slice(start_fragment, end_fragment)
    self.print_info(u"Synthesizing %d fragments" % (len(text_slice)))
    if quit_after is not None:
        self.print_info(u"Stop synthesizing upon reaching %.3f seconds" % (quit_after))
    try:
        synt = Synthesizer(rconf=self.rconf, logger=self.logger)
        synt.synthesize(text_slice, output_file_path, quit_after=quit_after, backwards=backwards)
        self.print_success(u"Created file '%s'" % output_file_path)
        synt.clear_cache()
        return self.NO_ERROR_EXIT_CODE
    except ImportError as exc:
        # an ImportError here means the selected TTS wrapper's
        # third-party dependency is missing: print install instructions
        tts = self.rconf[RuntimeConfiguration.TTS]
        if tts == Synthesizer.AWS:
            self.print_error(
                u"You need to install Python module boto3 to use the AWS Polly TTS API wrapper. Run:"
            )
            self.print_error(u"$ pip install boto3")
            self.print_error(u"or, to install for all users:")
            self.print_error(u"$ sudo pip install boto3")
        elif tts == Synthesizer.NUANCE:
            self.print_error(
                u"You need to install Python module requests to use the Nuance TTS API wrapper. Run:"
            )
            self.print_error(u"$ pip install requests")
            self.print_error(u"or, to install for all users:")
            self.print_error(u"$ sudo pip install requests")
        else:
            self.print_error(
                u"An unexpected error occurred while synthesizing text:")
            self.print_error(u"%s" % exc)
    except Exception as exc:
        self.print_error(
            u"An unexpected error occurred while synthesizing text:")
        self.print_error(u"%s" % exc)
    # reached after any handled exception above
    return self.ERROR_EXIT_CODE
def perform_command(self):
    """
    Perform command and return the appropriate exit code.

    Expects at least four positional arguments:
    text format, text (or input file path), language, output file path.

    :rtype: int
    """
    if len(self.actual_arguments) < 4:
        return self.print_help()
    text_format = gf.safe_unicode(self.actual_arguments[0])
    if text_format == u"list":
        # "list" format: the text is given inline on the command line
        text = gf.safe_unicode(self.actual_arguments[1])
    elif text_format in TextFileFormat.ALLOWED_VALUES:
        # file-based format: the second argument is a path that must exist
        text = self.actual_arguments[1]
        if not self.check_input_file(text):
            return self.ERROR_EXIT_CODE
    else:
        return self.print_help()
    # collect optional CLI switches; each is None when not given
    l1_id_regex = self.has_option_with_value(u"--l1-id-regex")
    l2_id_regex = self.has_option_with_value(u"--l2-id-regex")
    l3_id_regex = self.has_option_with_value(u"--l3-id-regex")
    id_regex = self.has_option_with_value(u"--id-regex")
    class_regex = self.has_option_with_value(u"--class-regex")
    sort = self.has_option_with_value(u"--sort")
    backwards = self.has_option([u"-b", u"--backwards"])
    quit_after = gf.safe_float(self.has_option_with_value(u"--quit-after"), None)
    start_fragment = gf.safe_int(self.has_option_with_value(u"--start"), None)
    end_fragment = gf.safe_int(self.has_option_with_value(u"--end"), None)
    parameters = {
        gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: l1_id_regex,
        gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: l2_id_regex,
        gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: l3_id_regex,
        gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: class_regex,
        gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
        gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: sort,
    }
    # munparsed format requires all three level regexes
    if (text_format == TextFileFormat.MUNPARSED) and ((l1_id_regex is None) or (l2_id_regex is None) or (l3_id_regex is None)):
        self.print_error(u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format")
        return self.ERROR_EXIT_CODE
    # unparsed format requires at least one of id/class regex
    if (text_format == TextFileFormat.UNPARSED) and (id_regex is None) and (class_regex is None):
        self.print_error(u"You must specify --id-regex and/or --class-regex for unparsed format")
        return self.ERROR_EXIT_CODE
    language = gf.safe_unicode(self.actual_arguments[2])
    output_file_path = self.actual_arguments[3]
    if not self.check_output_file(output_file_path):
        return self.ERROR_EXIT_CODE
    text_file = self.get_text_file(text_format, text, parameters)
    if text_file is None:
        self.print_error(u"Unable to build a TextFile from the given parameters")
        return self.ERROR_EXIT_CODE
    elif len(text_file) == 0:
        self.print_error(u"No text fragments found")
        return self.ERROR_EXIT_CODE
    text_file.set_language(language)
    self.print_info(u"Read input text with %d fragments" % (len(text_file)))
    if start_fragment is not None:
        self.print_info(u"Slicing from index %d" % (start_fragment))
    if end_fragment is not None:
        self.print_info(u"Slicing to index %d" % (end_fragment))
    text_slice = text_file.get_slice(start_fragment, end_fragment)
    self.print_info(u"Synthesizing %d fragments" % (len(text_slice)))
    if quit_after is not None:
        self.print_info(u"Stop synthesizing upon reaching %.3f seconds" % (quit_after))
    try:
        synt = Synthesizer(rconf=self.rconf, logger=self.logger)
        synt.synthesize(text_slice, output_file_path, quit_after=quit_after, backwards=backwards)
        self.print_success(u"Created file '%s'" % output_file_path)
        synt.clear_cache()
        return self.NO_ERROR_EXIT_CODE
    except ImportError as exc:
        # a missing optional third-party module: print targeted install advice
        tts = self.rconf[RuntimeConfiguration.TTS]
        if tts == Synthesizer.AWS:
            self.print_error(u"You need to install Python module boto3 to use the AWS Polly TTS API wrapper. Run:")
            self.print_error(u"$ pip install boto3")
            self.print_error(u"or, to install for all users:")
            self.print_error(u"$ sudo pip install boto3")
        elif tts == Synthesizer.NUANCE:
            self.print_error(u"You need to install Python module requests to use the Nuance TTS API wrapper. Run:")
            self.print_error(u"$ pip install requests")
            self.print_error(u"or, to install for all users:")
            self.print_error(u"$ sudo pip install requests")
        else:
            self.print_error(u"An unexpected error occurred while synthesizing text:")
            self.print_error(u"%s" % exc)
    except Exception as exc:
        self.print_error(u"An unexpected error occurred while synthesizing text:")
        self.print_error(u"%s" % exc)
    # reached only when an exception was caught above
    return self.ERROR_EXIT_CODE
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect the start of speech matching the text, by synthesizing a query
    from the text, extracting its MFCCs, and aligning the query (via DTW)
    against candidate speech intervals of the real audio.

    Return the detected start time, in seconds (0.0 if no candidate found).
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric: %s", metric])
    self._log(["Backwards: %s", str(backwards)])
    # characters per second of the real audio, used to estimate stretching
    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate: %.3f", audio_rate])
    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav", dir=gf.custom_tmp_dir())
    synt = Synthesizer(logger=self.logger)
    # synthesize more than max_start_length to have enough query material
    synt_duration = max_start_length * self.QUERY_FACTOR
    self._log(["Synthesizing %.3f seconds", synt_duration])
    result = synt.synthesize(self.text_file, tmp_file_path, quit_after=synt_duration, backwards=backwards)
    self._log("Synthesizing query... done")
    query_file = AudioFile(tmp_file_path)
    if backwards:
        # tail detection synthesized the text backwards: reverse the wave too
        self._log("Reversing query")
        query_file.reverse()
    self._log("Extracting MFCCs for query...")
    query_file.extract_mfcc(frame_rate=self.frame_rate)
    query_file.clear_data()
    self._log("Extracting MFCCs for query... done")
    self._log("Cleaning up...")
    self._cleanup(tmp_handler, tmp_file_path)
    self._log("Cleaning up... done")
    # result[2] holds the number of characters actually synthesized
    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len
    # how much slower the real audio is compared to the synthesized query
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate: %.3f", audio_rate])
    self._log(["Query rate: %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])
    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    # only search within the first AUDIO_FACTOR * max_start_length seconds
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])
    l, o = audio_mfcc.shape
    l, n = query_mfcc.shape
    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(n * stretch_factor)
    self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
    self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
    self._log(["Stretch factor: %.3f", stretch_factor])
    self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    # keep only speech intervals whose start lies in the allowed window
    admissible_intervals = [x for x in self.audio_speech if ((x[0] >= min_start_length) and (x[0] <= max_start_length))]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    candidates = []
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf
    for interval in admissible_intervals:
        # early termination heuristics
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log(" Breaking: too many runs without improvement")
            break
        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log(" Breaking: too many runs with minimum required length")
            break
        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log(["Evaluating interval starting at %d == %.3f ", start_index, start_time])
        if start_index > o:
            self._log(" Breaking: start index outside audio window")
            break
        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > o:
            self._log(" Breaking: not enough audio left in shifted window")
            break
        # cap the compared audio window to twice the query length
        end_index = min(start_index + 2 * n, o)
        end_time = self._i2t(end_index)
        self._log([" Start %d == %.3f", start_index, start_time])
        self._log([" Req end %d == %.3f", req_end_index, req_end_time])
        self._log([" Eff end %d == %.3f", end_index, end_time])
        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        l, m = audio_mfcc_sub.shape
        self._log("Computing DTW...")
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(m)
        aligner.synt_wave_length = self._i2t(n)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        last_row = acm[-1, :]
        self._log("Computing DTW... done")
        # find the minimum, but its index must be >= stretched_match_minimum_length
        # NOTE(review): if stretch_factor > 2, the slice below can be empty
        # (m <= stretched_match_minimum_length) and argmin would raise — confirm
        candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        # cost normalized by matched length
        candidate_distortion = candidate_value / candidate_length_index
        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0
        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        # append to the list of candidates
        self._log([" Interval start: %d == %.6f", start_index, start_time])
        self._log([" Interval end: %d == %.6f", end_index, end_time])
        self._log([" Candidate start: %d == %.6f", start_index, start_time])
        self._log([" Candidate end: %d == %.6f", candidate_end_index, candidate_end_time])
        self._log([" Candidate length: %d == %.6f", candidate_length_index, candidate_length_time])
        self._log([" Candidate value: %.6f", candidate_value])
        self._log([" Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })
    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time
def main():
    """
    Entry point for the command-line synthesis script.

    Usage: <language> <text_file_path> <text_format> [key=value ...] <audio_file_path>

    Recognized optional arguments (between the 3rd and the last):
    ``id_regex=...``, ``class_regex=...``, ``sort=...`` (unparsed text options),
    ``start=N`` / ``end=N`` (fragment slicing), ``quit_after=SECONDS``,
    and a bare ``-b``/``--backwards`` flag.
    """
    if len(sys.argv) < 5:
        usage()
        return
    language = sys.argv[1]
    text_file_path = sys.argv[2]
    text_format = sys.argv[3]
    audio_file_path = sys.argv[-1]
    backwards = False
    quit_after = None
    parameters = {}
    # parse the optional key=value arguments between argv[4] and the last one
    for i in range(4, len(sys.argv) - 1):
        args = sys.argv[i].split("=")
        if len(args) == 1:
            backwards = (args[0] in ["b", "-b", "backwards", "--backwards"])
        if len(args) == 2:
            key, value = args
            if key == "id_regex":
                parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX] = value
            if key == "class_regex":
                parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX] = value
            if key == "sort":
                parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT] = value
            if (key == "start") or (key == "end"):
                try:
                    parameters[key] = int(value)
                except ValueError:
                    # was a bare except: only ignore non-integer values,
                    # do not swallow unrelated errors
                    pass
            if key == "quit_after":
                quit_after = float(value)
    if text_format == "list":
        # inline text: fragments are separated by "|"
        text_file = TextFile()
        text_file.read_from_list(text_file_path.split("|"))
    else:
        text_file = TextFile(text_file_path, text_format, parameters)
    text_file.set_language(language)
    start_fragment = None
    if "start" in parameters:
        start_fragment = parameters["start"]
    end_fragment = None
    if "end" in parameters:
        end_fragment = parameters["end"]
    # print(...) with a single argument behaves identically under
    # Python 2 (parenthesized expression) and Python 3 (function call)
    print("[INFO] Read input text file with %d fragments" % (len(text_file)))
    if start_fragment is not None:
        print("[INFO] Slicing from index %d" % (start_fragment))
    if end_fragment is not None:
        print("[INFO] Slicing to index %d" % (end_fragment))
    text_slice = text_file.get_slice(start_fragment, end_fragment)
    print("[INFO] Synthesizing %d fragments" % (len(text_slice)))
    if quit_after is not None:
        print("[INFO] Stop synthesizing after reaching %.3f seconds" % (quit_after))
    if backwards:
        print("[INFO] Synthesizing backwards")
    synt = Synthesizer()
    synt.synthesize(text_slice, audio_file_path, quit_after, backwards)
    print("[INFO] Created file '%s'" % audio_file_path)
def test_synthesize_path_not_writeable(self):
    # Writing the synthesized wave to a non-writeable path must raise OSError.
    empty_text = TextFile()
    with self.assertRaises(OSError):
        Synthesizer().synthesize(empty_text, self.PATH_NOT_WRITEABLE)
def test_synthesize_invalid_text_file(self):
    # Passing a plain string instead of a TextFile must raise TypeError.
    with self.assertRaises(TypeError):
        Synthesizer().synthesize("foo", self.PATH_NOT_WRITEABLE)
def _detect(self, min_length, max_length, tail=False):
    """
    Detect the head or tail within ``min_length`` and ``max_length`` duration.

    If detecting the tail, the real wave MFCC and the query are reversed
    so that the tail detection problem reduces to a head detection problem.

    Return the duration of the head or tail, in seconds.

    :param min_length: estimated minimum length
    :type  min_length: :class:`~aeneas.exacttiming.TimeValue`
    :param max_length: estimated maximum length
    :type  max_length: :class:`~aeneas.exacttiming.TimeValue`
    :rtype: :class:`~aeneas.exacttiming.TimeValue`
    :raises: TypeError: if one of the parameters is not ``None`` or a number
    :raises: ValueError: if one of the parameters is negative
    """
    def _sanitize(value, default, name):
        # coerce to TimeValue, substituting the default for None,
        # and raise (via log_exc) on non-numeric or negative input
        if value is None:
            value = default
        try:
            value = TimeValue(value)
        except (TypeError, ValueError, InvalidOperation) as exc:
            self.log_exc(u"The value of %s is not a number" % (name), exc, True, TypeError)
        if value < 0:
            self.log_exc(u"The value of %s is negative" % (name), None, True, ValueError)
        return value
    min_length = _sanitize(min_length, self.MIN_LENGTH, "min_length")
    max_length = _sanitize(max_length, self.MAX_LENGTH, "max_length")
    # mws = MFCC window shift, i.e. seconds per MFCC frame
    mws = self.rconf.mws
    min_length_frames = int(min_length / mws)
    max_length_frames = int(max_length / mws)
    self.log([u"MFCC window shift s:     %.3f", mws])
    self.log([u"Min start length s:      %.3f", min_length])
    self.log([u"Min start length frames: %d", min_length_frames])
    self.log([u"Max start length s:      %.3f", max_length])
    self.log([u"Max start length frames: %d", max_length_frames])
    self.log([u"Tail?:                   %s", str(tail)])
    self.log(u"Synthesizing query...")
    # synthesize more than max_length to have enough query material
    synt_duration = max_length * self.QUERY_FACTOR
    self.log([u"Synthesizing at least %.3f seconds", synt_duration])
    tmp_handler, tmp_file_path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
    synt = Synthesizer(rconf=self.rconf, logger=self.logger)
    anchors, total_time, synthesized_chars = synt.synthesize(
        self.text_file,
        tmp_file_path,
        quit_after=synt_duration,
        backwards=tail
    )
    self.log(u"Synthesizing query... done")
    self.log(u"Extracting MFCCs for query...")
    query_mfcc = AudioFileMFCC(tmp_file_path, rconf=self.rconf, logger=self.logger)
    self.log(u"Extracting MFCCs for query... done")
    self.log(u"Cleaning up...")
    gf.delete_file(tmp_handler, tmp_file_path)
    self.log(u"Cleaning up... done")
    # only search the real wave within AUDIO_FACTOR * max_length seconds
    search_window = max_length * self.AUDIO_FACTOR
    search_window_end = min(int(search_window / mws), self.real_wave_mfcc.all_length)
    self.log([u"Query MFCC length (frames): %d", query_mfcc.all_length])
    self.log([u"Real MFCC length (frames):  %d", self.real_wave_mfcc.all_length])
    self.log([u"Search window end (s):      %.3f", search_window])
    self.log([u"Search window end (frames): %d", search_window_end])
    if tail:
        # reverse IN PLACE: self.real_wave_mfcc is shared state and is
        # reversed back below before every return path
        self.log(u"Tail => reversing real_wave_mfcc and query_mfcc")
        self.real_wave_mfcc.reverse()
        query_mfcc.reverse()
    # NOTE: VAD will be run here, if not done before
    speech_intervals = self.real_wave_mfcc.intervals(speech=True, time=False)
    if len(speech_intervals) < 1:
        self.log(u"No speech intervals, hence no start found")
        if tail:
            self.real_wave_mfcc.reverse()
        return TimeValue("0.000")
    # generate a list of begin indices
    search_end = None
    candidates_begin = []
    for interval in speech_intervals:
        if (interval[0] >= min_length_frames) and (interval[0] <= max_length_frames):
            candidates_begin.append(interval[0])
        search_end = interval[1]
        if search_end >= search_window_end:
            break
    # for each begin index, compute the acm cost
    # to match the query
    # note that we take the min over the last column of the acm
    # meaning that we allow to match the entire query wave
    # against a portion of the real wave
    candidates = []
    for candidate_begin in candidates_begin:
        self.log([u"Candidate interval starting at %d == %.3f", candidate_begin, candidate_begin * mws])
        try:
            rwm = AudioFileMFCC(
                mfcc_matrix=self.real_wave_mfcc.all_mfcc[:, candidate_begin:search_end],
                rconf=self.rconf,
                logger=self.logger
            )
            dtw = DTWAligner(
                real_wave_mfcc=rwm,
                synt_wave_mfcc=query_mfcc,
                rconf=self.rconf,
                logger=self.logger
            )
            acm = dtw.compute_accumulated_cost_matrix()
            last_column = acm[:, -1]
            min_value = numpy.min(last_column)
            min_index = numpy.argmin(last_column)
            self.log([u"Candidate interval: %d %d == %.3f %.3f", candidate_begin, search_end, candidate_begin * mws, search_end * mws])
            self.log([u" Min value: %.6f", min_value])
            self.log([u" Min index: %d == %.3f", min_index, min_index * mws])
            candidates.append((min_value, candidate_begin, min_index))
        except Exception as exc:
            # best-effort: a failing candidate is logged and skipped,
            # it does not abort the whole detection
            self.log_exc(u"An unexpected error occurred while running _detect", exc, False, None)
    # reverse again the real wave
    if tail:
        self.log(u"Tail => reversing real_wave_mfcc again")
        self.real_wave_mfcc.reverse()
    # return
    if len(candidates) < 1:
        self.log(u"No candidates found")
        return TimeValue("0.000")
    self.log(u"Candidates:")
    for candidate in candidates:
        self.log([u" Value: %.6f Begin Time: %.3f Min Index: %d", candidate[0], candidate[1] * mws, candidate[2]])
    # tuples sort lexicographically: pick the begin index of the lowest cost
    best = sorted(candidates)[0][1]
    self.log([u"Best candidate: %d == %.3f", best, best * mws])
    return best * mws
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect the start of speech matching the text: synthesize a query from
    the text file, extract its MFCCs, then DTW-align the query against each
    admissible speech interval of the real audio and pick the best match.

    Return the detected start time, in seconds (0.0 if no candidate found).
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric: %s", metric])
    self._log(["Backwards: %s", str(backwards)])
    # characters per second of the real audio
    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate: %.3f", audio_rate])
    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav", dir=gf.custom_tmp_dir())
    synt = Synthesizer(logger=self.logger)
    # over-synthesize by QUERY_FACTOR so the query covers the search window
    synt_duration = max_start_length * self.QUERY_FACTOR
    self._log(["Synthesizing %.3f seconds", synt_duration])
    result = synt.synthesize(self.text_file, tmp_file_path, quit_after=synt_duration, backwards=backwards)
    self._log("Synthesizing query... done")
    query_file = AudioFile(tmp_file_path)
    if backwards:
        # text was synthesized backwards (tail detection): reverse the wave
        self._log("Reversing query")
        query_file.reverse()
    self._log("Extracting MFCCs for query...")
    query_file.extract_mfcc(frame_rate=self.frame_rate)
    query_file.clear_data()
    self._log("Extracting MFCCs for query... done")
    self._log("Cleaning up...")
    self._cleanup(tmp_handler, tmp_file_path)
    self._log("Cleaning up... done")
    # result[2] is the number of characters actually synthesized
    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len
    # ratio of synthesized speech rate to real speech rate, floored at 1
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate: %.3f", audio_rate])
    self._log(["Query rate: %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])
    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    # restrict the real audio to the search window
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])
    l, o = audio_mfcc.shape
    l, n = query_mfcc.shape
    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(n * stretch_factor)
    self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
    self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
    self._log(["Stretch factor: %.3f", stretch_factor])
    self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    # candidate intervals must begin inside [min_start_length, max_start_length]
    admissible_intervals = [x for x in self.audio_speech if ((x[0] >= min_start_length) and (x[0] <= max_start_length))]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    candidates = []
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf
    for interval in admissible_intervals:
        # stop early when further runs are unlikely to improve the result
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log(" Breaking: too many runs without improvement")
            break
        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log(" Breaking: too many runs with minimum required length")
            break
        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log(["Evaluating interval starting at %d == %.3f ", start_index, start_time])
        if start_index > o:
            self._log(" Breaking: start index outside audio window")
            break
        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > o:
            self._log(" Breaking: not enough audio left in shifted window")
            break
        # compare at most 2n real frames against the n-frame query
        end_index = min(start_index + 2 * n, o)
        end_time = self._i2t(end_index)
        self._log([" Start  %d == %.3f", start_index, start_time])
        self._log([" Req end %d == %.3f", req_end_index, req_end_time])
        self._log([" Eff end %d == %.3f", end_index, end_time])
        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        l, m = audio_mfcc_sub.shape
        self._log("Computing DTW...")
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(m)
        aligner.synt_wave_length = self._i2t(n)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        last_row = acm[-1, :]
        self._log("Computing DTW... done")
        # find the minimum, but its index must be >= stretched_match_minimum_length
        # NOTE(review): when stretch_factor > 2 the slice can be empty and
        # numpy.argmin raises ValueError — confirm upstream bounds
        candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        # DTW cost normalized by the matched length
        candidate_distortion = candidate_value / candidate_length_index
        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0
        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        # append to the list of candidates
        self._log([" Interval start: %d == %.6f", start_index, start_time])
        self._log([" Interval end: %d == %.6f", end_index, end_time])
        self._log([" Candidate start: %d == %.6f", start_index, start_time])
        self._log([" Candidate end: %d == %.6f", candidate_end_index, candidate_end_time])
        self._log([" Candidate length: %d == %.6f", candidate_length_index, candidate_length_time])
        self._log([" Candidate value: %.6f", candidate_value])
        self._log([" Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })
    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time
def _detect(self, min_length, max_length, tail=False):
    """
    Detect the head or tail within ``min_length`` and ``max_length`` duration.

    If detecting the tail, the real wave MFCC and the query are reversed
    so that the tail detection problem reduces to a head detection problem.

    Return the duration of the head or tail, in seconds.

    :param min_length: estimated minimum length
    :type  min_length: :class:`~aeneas.timevalue.TimeValue`
    :param max_length: estimated maximum length
    :type  max_length: :class:`~aeneas.timevalue.TimeValue`
    :rtype: :class:`~aeneas.timevalue.TimeValue`
    :raises: TypeError: if one of the parameters is not ``None`` or a number
    :raises: ValueError: if one of the parameters is negative
    """
    def _sanitize(value, default, name):
        # apply the default for None, coerce to TimeValue,
        # and report (via log_exc) non-numeric or negative values
        if value is None:
            value = default
        try:
            value = TimeValue(value)
        except (TypeError, ValueError, InvalidOperation) as exc:
            self.log_exc(u"The value of %s is not a number" % (name), exc, True, TypeError)
        if value < 0:
            self.log_exc(u"The value of %s is negative" % (name), None, True, ValueError)
        return value
    min_length = _sanitize(min_length, self.MIN_LENGTH, "min_length")
    max_length = _sanitize(max_length, self.MAX_LENGTH, "max_length")
    # mws = MFCC window shift (seconds per frame)
    mws = self.rconf.mws
    min_length_frames = int(min_length / mws)
    max_length_frames = int(max_length / mws)
    self.log([u"MFCC window shift s:     %.3f", mws])
    self.log([u"Min start length s:      %.3f", min_length])
    self.log([u"Min start length frames: %d", min_length_frames])
    self.log([u"Max start length s:      %.3f", max_length])
    self.log([u"Max start length frames: %d", max_length_frames])
    self.log([u"Tail?:                   %s", str(tail)])
    self.log(u"Synthesizing query...")
    # over-synthesize so the query covers the whole search window
    synt_duration = max_length * self.QUERY_FACTOR
    self.log([u"Synthesizing at least %.3f seconds", synt_duration])
    tmp_handler, tmp_file_path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
    synt = Synthesizer(rconf=self.rconf, logger=self.logger)
    anchors, total_time, synthesized_chars = synt.synthesize(
        self.text_file,
        tmp_file_path,
        quit_after=synt_duration,
        backwards=tail
    )
    self.log(u"Synthesizing query... done")
    self.log(u"Extracting MFCCs for query...")
    query_mfcc = AudioFileMFCC(tmp_file_path, rconf=self.rconf, logger=self.logger)
    self.log(u"Extracting MFCCs for query... done")
    self.log(u"Cleaning up...")
    gf.delete_file(tmp_handler, tmp_file_path)
    self.log(u"Cleaning up... done")
    # limit the search to AUDIO_FACTOR * max_length seconds of real audio
    search_window = max_length * self.AUDIO_FACTOR
    search_window_end = min(int(search_window / mws), self.real_wave_mfcc.all_length)
    self.log([u"Query MFCC length (frames): %d", query_mfcc.all_length])
    self.log([u"Real MFCC length (frames):  %d", self.real_wave_mfcc.all_length])
    self.log([u"Search window end (s):      %.3f", search_window])
    self.log([u"Search window end (frames): %d", search_window_end])
    if tail:
        # in-place reversal of shared state; restored before every return
        self.log(u"Tail => reversing real_wave_mfcc and query_mfcc")
        self.real_wave_mfcc.reverse()
        query_mfcc.reverse()
    # NOTE: VAD will be run here, if not done before
    speech_intervals = self.real_wave_mfcc.intervals(speech=True, time=False)
    if len(speech_intervals) < 1:
        self.log(u"No speech intervals, hence no start found")
        if tail:
            self.real_wave_mfcc.reverse()
        return TimeValue("0.000")
    # generate a list of begin indices
    search_end = None
    candidates_begin = []
    for interval in speech_intervals:
        if (interval[0] >= min_length_frames) and (interval[0] <= max_length_frames):
            candidates_begin.append(interval[0])
        search_end = interval[1]
        if search_end >= search_window_end:
            break
    # for each begin index, compute the acm cost
    # to match the query
    # note that we take the min over the last column of the acm
    # meaning that we allow to match the entire query wave
    # against a portion of the real wave
    candidates = []
    for candidate_begin in candidates_begin:
        self.log([u"Candidate interval starting at %d == %.3f", candidate_begin, candidate_begin * mws])
        try:
            rwm = AudioFileMFCC(
                mfcc_matrix=self.real_wave_mfcc.all_mfcc[:, candidate_begin:search_end],
                rconf=self.rconf,
                logger=self.logger
            )
            dtw = DTWAligner(
                real_wave_mfcc=rwm,
                synt_wave_mfcc=query_mfcc,
                rconf=self.rconf,
                logger=self.logger
            )
            acm = dtw.compute_accumulated_cost_matrix()
            last_column = acm[:, -1]
            min_value = numpy.min(last_column)
            min_index = numpy.argmin(last_column)
            self.log([u"Candidate interval: %d %d == %.3f %.3f", candidate_begin, search_end, candidate_begin * mws, search_end * mws])
            self.log([u" Min value: %.6f", min_value])
            self.log([u" Min index: %d == %.3f", min_index, min_index * mws])
            candidates.append((min_value, candidate_begin, min_index))
        except Exception as exc:
            # a failing candidate is logged and skipped, not fatal
            self.log_exc(u"An unexpected error occurred while running _detect", exc, False, None)
    # reverse again the real wave
    if tail:
        self.log(u"Tail => reversing real_wave_mfcc again")
        self.real_wave_mfcc.reverse()
    # return
    if len(candidates) < 1:
        self.log(u"No candidates found")
        return TimeValue("0.000")
    self.log(u"Candidates:")
    for candidate in candidates:
        self.log([u" Value: %.6f Begin Time: %.3f Min Index: %d", candidate[0], candidate[1] * mws, candidate[2]])
    # lexicographic tuple sort => begin index of the minimum-cost candidate
    best = sorted(candidates)[0][1]
    self.log([u"Best candidate: %d == %.3f", best, best * mws])
    return best * mws
class ExecuteTask(Loggable):
    """
    Execute a task, that is, compute the sync map for it.

    The main entry point is :func:`~aeneas.executetask.ExecuteTask.execute`,
    which validates the task input (audio and text), then dispatches to a
    single-level or multi-level execution depending on the text file format.

    :param task: the task to be executed
    :type task: :class:`~aeneas.task.Task`
    :param rconf: a runtime configuration
    :type rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type logger: :class:`~aeneas.logger.Logger`
    """

    TAG = u"ExecuteTask"

    def __init__(self, task=None, rconf=None, logger=None):
        super(ExecuteTask, self).__init__(rconf=rconf, logger=logger)
        self.task = task
        # bookkeeping for step logging (see _step_begin/_step_end)
        self.step_index = 1
        self.step_label = u""
        self.step_begin_time = None
        self.step_total = 0.000
        # created lazily by _set_synthesizer()
        self.synthesizer = None
        if task is not None:
            self.load_task(self.task)

    def load_task(self, task):
        """
        Load the task from the given ``Task`` object.

        :param task: the task to load
        :type task: :class:`~aeneas.task.Task`
        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`: if ``task`` is not an instance of :class:`~aeneas.task.Task`
        """
        if not isinstance(task, Task):
            self.log_exc(u"task is not an instance of Task", None, True, ExecuteTaskInputError)
        self.task = task

    def _step_begin(self, label, log=True):
        """
        Log the begin of a step, recording its label and begin time.

        NOTE(review): ``self.log(...)`` appears to return the log timestamp
        (its result is stored and later subtracted in ``_step_end`` to obtain
        a timedelta-like object) — confirm against the Loggable base class.
        """
        if log:
            self.step_label = label
            self.step_begin_time = self.log(u"STEP %d BEGIN (%s)" % (self.step_index, label))

    def _step_end(self, log=True):
        """
        Log the end of the current step, accumulate its duration
        into ``self.step_total``, and advance ``self.step_index``.
        """
        if log:
            step_end_time = self.log(u"STEP %d END (%s)" % (self.step_index, self.step_label))
            # diff is a timedelta-like object; convert to float seconds
            diff = (step_end_time - self.step_begin_time)
            diff = float(diff.seconds + diff.microseconds / 1000000.0)
            self.step_total += diff
            self.log(u"STEP %d DURATION %.3f (%s)" % (self.step_index, diff, self.step_label))
            self.step_index += 1

    def _step_failure(self, exc):
        """
        Log the failure of the current step and re-raise as
        :class:`~aeneas.executetask.ExecuteTaskExecutionError`.
        """
        self.log_crit(u"STEP %d (%s) FAILURE" % (self.step_index, self.step_label))
        self.step_index += 1
        self.log_exc(u"Unexpected error while executing task", exc, True, ExecuteTaskExecutionError)

    def _step_total(self):
        """ Log the total time spent in all completed steps. """
        self.log(u"STEP T DURATION %.3f" % (self.step_total))

    def execute(self):
        """
        Execute the task.
        The sync map produced will be stored inside the task object.

        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`: if there is a problem with the input parameters
        :raises: :class:`~aeneas.executetask.ExecuteTaskExecutionError`: if there is a problem during the task execution
        """
        self.log(u"Executing task...")

        # check that we have the AudioFile object
        if self.task.audio_file is None:
            self.log_exc(u"The task does not seem to have its audio file set", None, True, ExecuteTaskInputError)
        if ((self.task.audio_file.audio_length is None) or (self.task.audio_file.audio_length <= 0)):
            self.log_exc(u"The task seems to have an invalid audio file", None, True, ExecuteTaskInputError)
        # enforce the maximum audio length, if configured (> 0)
        task_max_audio_length = self.rconf[RuntimeConfiguration.TASK_MAX_AUDIO_LENGTH]
        if ((task_max_audio_length > 0) and (self.task.audio_file.audio_length > task_max_audio_length)):
            self.log_exc(u"The audio file of the task has length %.3f, more than the maximum allowed (%.3f)." % (self.task.audio_file.audio_length, task_max_audio_length), None, True, ExecuteTaskInputError)

        # check that we have the TextFile object
        if self.task.text_file is None:
            self.log_exc(u"The task does not seem to have its text file set", None, True, ExecuteTaskInputError)
        if len(self.task.text_file) == 0:
            self.log_exc(u"The task text file seems to have no text fragments", None, True, ExecuteTaskInputError)
        # enforce the maximum number of text fragments, if configured (> 0)
        task_max_text_length = self.rconf[RuntimeConfiguration.TASK_MAX_TEXT_LENGTH]
        if ((task_max_text_length > 0) and (len(self.task.text_file) > task_max_text_length)):
            self.log_exc(u"The text file of the task has %d fragments, more than the maximum allowed (%d)." % (len(self.task.text_file), task_max_text_length), None, True, ExecuteTaskInputError)
        if self.task.text_file.chars == 0:
            self.log_exc(u"The task text file seems to have empty text", None, True, ExecuteTaskInputError)

        self.log(u"Both audio and text input file are present")

        # execute: reset step counters, then dispatch on text file format
        self.step_index = 1
        self.step_total = 0.000
        if self.task.text_file.file_format in TextFileFormat.MULTILEVEL_VALUES:
            self._execute_multi_level_task()
        else:
            self._execute_single_level_task()
        self.log(u"Executing task... done")

    def _execute_single_level_task(self):
        """ Execute a single-level task """
        self.log(u"Executing single level task...")
        try:
            # load audio file, extract MFCCs from real wave, clear audio file
            self._step_begin(u"extract MFCC real wave")
            real_wave_mfcc = self._extract_mfcc(file_path=self.task.audio_file_path_absolute, file_format=None)
            self._step_end()

            # compute head and/or tail and set it
            self._step_begin(u"compute head tail")
            (head_length, process_length, tail_length) = self._compute_head_process_tail(real_wave_mfcc)
            real_wave_mfcc.set_head_middle_tail(head_length, process_length, tail_length)
            self._step_end()

            # compute a time map alignment
            self._set_synthesizer()
            time_map = self._execute_inner(real_wave_mfcc, self.task.text_file, adjust_boundaries=True, log=True)
            self._clear_cache_synthesizer()

            # convert time_map to tree and create syncmap and add it to task
            self._step_begin(u"create sync map")
            tree = self._level_time_map_to_tree(self.task.text_file, time_map)
            self.task.sync_map = self._create_syncmap(tree)
            self._step_end()

            # check for fragments with zero duration
            self._step_begin(u"check zero duration")
            self._check_no_zero(self.rconf.mws)
            self._step_end()

            # log total
            self._step_total()
            self.log(u"Executing single level task... done")
        except Exception as exc:
            # any error during a step is logged and re-raised
            # as ExecuteTaskExecutionError by _step_failure
            self._step_failure(exc)

    def _execute_multi_level_task(self):
        """ Execute a multi-level task """
        self.log(u"Executing multi level task...")

        self.log(u"Saving rconf...")
        # save original rconf (restored before creating the sync map)
        orig_rconf = self.rconf.clone()
        # clone rconfs and set granularity
        # NOTE: index 0 is unused so that level i lives at index i (levels 1..3)
        level_rconfs = [None, self.rconf.clone(), self.rconf.clone(), self.rconf.clone()]
        level_mfccs = [None, None, None, None]
        for i in range(1, len(level_rconfs)):
            level_rconfs[i].set_granularity(i)
            self.log([u"Level %d mws: %.3f", i, level_rconfs[i].mws])
            self.log([u"Level %d mwl: %.3f", i, level_rconfs[i].mwl])
            level_rconfs[i].set_tts(i)
            self.log([u"Level %d tts: %s", i, level_rconfs[i].tts])
            self.log([u"Level %d tts_path: %s", i, level_rconfs[i].tts_path])
        self.log(u"Saving rconf... done")

        try:
            self.log(u"Creating AudioFile object...")
            audio_file = self._load_audio_file()
            self.log(u"Creating AudioFile object... done")

            # extract MFCC for each level;
            # reuse the previous level's MFCCs when both the MFCC window
            # shift (mws) and length (mwl) are unchanged
            for i in range(1, len(level_rconfs)):
                self._step_begin(u"extract MFCC real wave level %d" % i)
                if (i == 1) or (level_rconfs[i].mws != level_rconfs[i - 1].mws) or (level_rconfs[i].mwl != level_rconfs[i - 1].mwl):
                    self.rconf = level_rconfs[i]
                    level_mfccs[i] = self._extract_mfcc(audio_file=audio_file)
                else:
                    self.log(u"Keeping MFCC real wave from previous level")
                    level_mfccs[i] = level_mfccs[i - 1]
                self._step_end()

            self.log(u"Clearing AudioFile object...")
            self.rconf = level_rconfs[1]
            self._clear_audio_file(audio_file)
            self.log(u"Clearing AudioFile object... done")

            # compute head tail for the entire real wave (level 1)
            self._step_begin(u"compute head tail")
            (head_length, process_length, tail_length) = self._compute_head_process_tail(level_mfccs[1])
            level_mfccs[1].set_head_middle_tail(head_length, process_length, tail_length)
            self._step_end()

            # compute alignment at each level
            tree = Tree()
            sync_roots = [tree]
            text_files = [self.task.text_file]
            # aht[i]: add head/tail nodes at level i; aba[i]: adjust boundaries at level i
            aht = [None, True, False, False]
            aba = [None, True, True, False]
            for i in range(1, len(level_rconfs)):
                self._step_begin(u"compute alignment level %d" % i)
                self.rconf = level_rconfs[i]
                text_files, sync_roots = self._execute_level(i, level_mfccs[i], text_files, sync_roots, aht[i], aba[i])
                self._step_end()

            self._step_begin(u"select levels")
            tree = self._select_levels(tree)
            self._step_end()

            self._step_begin(u"create sync map")
            # restore the original rconf before emitting the sync map
            self.rconf = orig_rconf
            self.task.sync_map = self._create_syncmap(tree)
            self._step_end()

            self._step_begin(u"check zero duration")
            self._check_no_zero(level_rconfs[-1].mws)
            self._step_end()

            self._step_total()
            self.log(u"Executing multi level task... done")
        except Exception as exc:
            # any error during a step is logged and re-raised
            # as ExecuteTaskExecutionError by _step_failure
            self._step_failure(exc)

    def _execute_level(self, level, audio_file_mfcc, text_files, sync_roots, add_head_tail, adjust_boundaries):
        """
        Compute the alignment for all the nodes in the given level.

        Return a pair (next_level_text_files, next_level_sync_roots),
        containing two lists of text file subtrees and sync map subtrees
        on the next level.

        :param int level: the level
        :param audio_file_mfcc: the audio MFCC representation for this level
        :type audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param list text_files: a list of :class:`~aeneas.textfile.TextFile` objects,
                                each representing a (sub)tree of the Task text file
        :param list sync_roots: a list of :class:`~aeneas.tree.Tree` objects,
                                each representing a SyncMapFragment tree,
                                one for each element in ``text_files``
        :param bool add_head_tail: if ``True``, add head and tail nodes to the sync map tree
        :param bool adjust_boundaries: if ``True``, execute the adjust boundary algorithm
        :rtype: (list, list)
        """
        self._set_synthesizer()
        next_level_text_files = []
        next_level_sync_roots = []
        for text_file_index, text_file in enumerate(text_files):
            self.log([u"Text level %d, fragment %d", level, text_file_index])
            self.log([u" Len: %d", len(text_file)])
            sync_root = sync_roots[text_file_index]
            if (level > 1) and (len(text_file) == 1):
                # a single child inherits its parent's interval verbatim:
                # no alignment needed, emit a trivial (head, body, tail) map
                self.log(u" Level > 1 and only one child => returning trivial timemap")
                time_map = [
                    (TimeValue("0.000"), sync_root.value.begin),
                    (sync_root.value.begin, sync_root.value.end),
                    (sync_root.value.end, audio_file_mfcc.audio_length)
                ]
            else:
                self.log(u" Level 1 or more than one child => computing timemap")
                if not sync_root.is_empty:
                    # restrict the MFCC window to the parent's interval
                    begin = sync_root.value.begin
                    end = sync_root.value.end
                    self.log([u" Begin: %.3f", begin])
                    self.log([u" End: %.3f", end])
                    audio_file_mfcc.set_head_middle_tail(head_length=begin, middle_length=(end - begin))
                else:
                    self.log(u" No begin or end to set")
                time_map = self._execute_inner(audio_file_mfcc, text_file, adjust_boundaries=adjust_boundaries, log=False)
            self.log([u" Map: %s", str(time_map)])
            self._level_time_map_to_tree(text_file, time_map, sync_root, add_head_tail=add_head_tail)
            # store next level roots
            next_level_text_files.extend(text_file.children_not_empty)
            src = sync_root.children
            if add_head_tail:
                # if we added head and tail,
                # we must not pass them to the next level
                src = src[1:-1]
            next_level_sync_roots.extend(src)
        self._clear_cache_synthesizer()
        return (next_level_text_files, next_level_sync_roots)

    def _execute_inner(self, audio_file_mfcc, text_file, adjust_boundaries=True, log=True):
        """
        Align a subinterval of the given AudioFileMFCC
        with the given TextFile.

        Return the computed time map, as a list of intervals.

        The begin and end positions inside the AudioFileMFCC
        must have been set ahead by the caller.

        The text fragments being aligned are the vchildren of ``text_file``.

        Pipeline: synthesize text -> extract MFCCs of the synthesized wave
        -> DTW-align against the real wave -> adjust boundaries.

        :param audio_file_mfcc: the audio file MFCC representation
        :type audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param text_file: the text file subtree to align
        :type text_file: :class:`~aeneas.textfile.TextFile`
        :param bool adjust_boundaries: if ``True``, execute the adjust boundary algorithm
        :param bool log: if ``True``, log steps
        :rtype: list
        """
        self._step_begin(u"synthesize text", log=log)
        synt_handler, synt_path, synt_anchors, synt_format = self._synthesize(text_file)
        self._step_end(log=log)

        self._step_begin(u"extract MFCC synt wave", log=log)
        synt_wave_mfcc = self._extract_mfcc(file_path=synt_path, file_format=synt_format)
        # the temporary synthesized wave is no longer needed once its MFCCs exist
        gf.delete_file(synt_handler, synt_path)
        self._step_end(log=log)

        self._step_begin(u"align waves", log=log)
        indices = self._align_waves(audio_file_mfcc, synt_wave_mfcc, synt_anchors)
        self._step_end(log=log)

        self._step_begin(u"adjust boundaries", log=log)
        time_map = self._adjust_boundaries(audio_file_mfcc, text_file, indices, adjust_boundaries)
        self._step_end(log=log)

        return time_map

    def _load_audio_file(self):
        """
        Load audio in memory.

        :rtype: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"load audio file")
        # NOTE file_format=None forces conversion to
        #      PCM16 mono WAVE with proper sample rate
        audio_file = AudioFile(file_path=self.task.audio_file_path_absolute, file_format=None, rconf=self.rconf, logger=self.logger)
        audio_file.read_samples_from_file()
        self._step_end()
        return audio_file

    def _clear_audio_file(self, audio_file):
        """
        Clear audio from memory.

        :param audio_file: the object to clear
        :type audio_file: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"clear audio file")
        audio_file.clear_data()
        # NOTE(review): rebinding the local name does not affect the caller's
        # reference; the actual release is done by clear_data() above
        audio_file = None
        self._step_end()

    def _extract_mfcc(self, file_path=None, file_format=None, audio_file=None):
        """
        Extract the MFCCs from the given audio file.

        Exactly one of ``file_path`` or ``audio_file`` is expected
        to be provided by the callers in this class.

        :rtype: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        """
        return AudioFileMFCC(file_path=file_path, file_format=file_format, audio_file=audio_file, rconf=self.rconf, logger=self.logger)

    def _compute_head_process_tail(self, audio_file_mfcc):
        """
        Set the audio file head or tail,
        by either reading the explicit values
        from the Task configuration,
        or using SD to determine them.

        This function returns the lengths, in seconds,
        of the (head, process, tail).

        :rtype: tuple (float, float, float)
        """
        head_length = self.task.configuration["i_a_head"]
        process_length = self.task.configuration["i_a_process"]
        tail_length = self.task.configuration["i_a_tail"]
        head_max = self.task.configuration["i_a_head_max"]
        head_min = self.task.configuration["i_a_head_min"]
        tail_max = self.task.configuration["i_a_tail_max"]
        tail_min = self.task.configuration["i_a_tail_min"]
        if ((head_length is not None) or (process_length is not None) or (tail_length is not None)):
            # at least one explicit value given: use them as-is
            self.log(u"Setting explicit head process tail")
        else:
            # no explicit values: run speech detection (SD),
            # but only within the min/max bounds, if provided
            self.log(u"Detecting head tail...")
            sd = SD(audio_file_mfcc, self.task.text_file, rconf=self.rconf, logger=self.logger)
            head_length = TimeValue("0.000")
            process_length = None
            tail_length = TimeValue("0.000")
            if (head_min is not None) or (head_max is not None):
                self.log(u"Detecting HEAD...")
                head_length = sd.detect_head(head_min, head_max)
                self.log([u"Detected HEAD: %.3f", head_length])
                self.log(u"Detecting HEAD... done")
            if (tail_min is not None) or (tail_max is not None):
                self.log(u"Detecting TAIL...")
                tail_length = sd.detect_tail(tail_min, tail_max)
                self.log([u"Detected TAIL: %.3f", tail_length])
                self.log(u"Detecting TAIL... done")
            self.log(u"Detecting head tail... done")
        self.log([u"Head: %s", gf.safe_float(head_length, None)])
        self.log([u"Process: %s", gf.safe_float(process_length, None)])
        self.log([u"Tail: %s", gf.safe_float(tail_length, None)])
        return (head_length, process_length, tail_length)

    def _set_synthesizer(self):
        """ Create synthesizer """
        self.log(u"Setting synthesizer...")
        self.synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
        self.log(u"Setting synthesizer... done")

    def _clear_cache_synthesizer(self):
        """ Clear the cache of the synthesizer """
        self.log(u"Clearing synthesizer...")
        self.synthesizer.clear_cache()
        self.log(u"Clearing synthesizer... done")

    def _synthesize(self, text_file):
        """
        Synthesize text into a WAVE file.

        Return a tuple consisting of:

        1. the handler of the generated audio file
        2. the path of the generated audio file
        3. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        4. a tuple describing the format of the audio file

        :param text_file: the text to be synthesized
        :type text_file: :class:`~aeneas.textfile.TextFile`
        :rtype: tuple (handler, string, list, tuple)
        """
        handler, path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        result = self.synthesizer.synthesize(text_file, path)
        # NOTE(review): the anchors are assumed to be the first element of
        # the synthesizer result tuple — confirm against Synthesizer.synthesize
        return (handler, path, result[0], self.synthesizer.output_audio_format)

    def _align_waves(self, real_wave_mfcc, synt_wave_mfcc, synt_anchors):
        """
        Align two AudioFileMFCC objects,
        representing WAVE files.

        Return a list of boundary indices.
        """
        self.log(u"Creating DTWAligner...")
        aligner = DTWAligner(real_wave_mfcc, synt_wave_mfcc, rconf=self.rconf, logger=self.logger)
        self.log(u"Creating DTWAligner... done")
        self.log(u"Computing boundary indices...")
        boundary_indices = aligner.compute_boundaries(synt_anchors)
        self.log(u"Computing boundary indices... done")
        return boundary_indices

    def _adjust_boundaries(self, real_wave_mfcc, text_file, boundary_indices, adjust_boundaries=True):
        """
        Adjust boundaries as requested by the user.

        Return the computed time map, that is,
        a list of pairs ``[start_time, end_time]``,
        of length equal to number of fragments + 2,
        where the two extra elements are for
        the HEAD (first) and TAIL (last).
        """
        # boundary_indices contains the boundary indices in the all_mfcc of real_wave_mfcc
        # starting with the (head-1st fragment) and ending with (-1th fragment-tail)
        if adjust_boundaries:
            aba_algorithm, aba_parameters = self.task.configuration.aba_parameters()
            self.log([u"Running algorithm: '%s'", aba_algorithm])
        else:
            # caller asked to skip adjustment: force the AUTO algorithm
            self.log(u"Forced running algorithm: 'auto'")
            aba_algorithm = AdjustBoundaryAlgorithm.AUTO
            aba_parameters = None
        return AdjustBoundaryAlgorithm(
            algorithm=aba_algorithm,
            parameters=aba_parameters,
            real_wave_mfcc=real_wave_mfcc,
            boundary_indices=boundary_indices,
            text_file=text_file,
            rconf=self.rconf,
            logger=self.logger
        ).to_time_map()

    def _level_time_map_to_tree(self, text_file, time_map, tree=None, add_head_tail=True):
        """
        Convert a level time map into a Tree of SyncMapFragments.

        The time map is
        a list of pairs ``[start_time, end_time]``,
        of length equal to number of fragments + 2,
        where the two extra elements are for
        the HEAD (first) and TAIL (last).

        :param text_file: the text file object
        :type text_file: :class:`~aeneas.textfile.TextFile`
        :param list time_map: the time map
        :param tree: the tree; if ``None``, a new Tree will be built
        :type tree: :class:`~aeneas.tree.Tree`
        :rtype: :class:`~aeneas.tree.Tree`
        """
        if tree is None:
            tree = Tree()
        if add_head_tail:
            # synthetic HEAD/TAIL fragments wrap the real ones;
            # i = 0 so that HEAD consumes the first time_map interval
            fragments = (
                [TextFragment(u"HEAD", self.task.configuration["language"], [u""])] +
                text_file.fragments +
                [TextFragment(u"TAIL", self.task.configuration["language"], [u""])]
            )
            i = 0
        else:
            # no HEAD node: skip the first (head) interval of the time map
            fragments = text_file.fragments
            i = 1
        for fragment in fragments:
            interval = time_map[i]
            sm_frag = SyncMapFragment(fragment, interval[0], interval[1])
            tree.add_child(Tree(value=sm_frag))
            i += 1
        return tree

    def _select_levels(self, tree):
        """
        Select the correct levels in the tree,
        reading the ``os_task_file_levels``
        parameter in the Task configuration.

        If ``None`` or invalid, return the current sync map tree
        unchanged.
        Otherwise, return only the levels appearing in it.

        :param tree: a Tree of SyncMapFragments
        :type tree: :class:`~aeneas.tree.Tree`
        :rtype: :class:`~aeneas.tree.Tree`
        """
        levels = self.task.configuration["o_levels"]
        self.log([u"Levels: '%s'", levels])
        if (levels is None) or (len(levels) < 1):
            return tree
        try:
            # keep only strictly positive integer levels
            levels = [int(l) for l in levels if int(l) > 0]
            self.log([u"Converted levels: %s", levels])
        except ValueError:
            self.log_warn(u"Cannot convert levels to list of int, returning unchanged")
            return tree
        # remove head and tail nodes
        head = tree.vchildren[0]
        tail = tree.vchildren[-1]
        tree.remove_child(0)
        tree.remove_child(-1)
        # keep only the selected levels
        tree.keep_levels(levels)
        # add head and tail back
        tree.add_child(Tree(value=head), as_last=False)
        tree.add_child(Tree(value=tail), as_last=True)
        # return the new tree
        return tree

    def _create_syncmap(self, tree):
        """
        Return a sync map corresponding to the provided text file and time map.

        The HEAD/TAIL fragments are kept, removed, or folded into the
        first/last real fragments depending on the configured
        head/tail format (ADD, STRETCH, or other).

        :param tree: a Tree of SyncMapFragments
        :type tree: :class:`~aeneas.tree.Tree`
        :rtype: :class:`~aeneas.syncmap.SyncMap`
        """
        self.log([u"Fragments in time map (including HEAD/TAIL): %d", len(tree)])
        head_tail_format = self.task.configuration["o_h_t_format"]
        self.log([u"Head/tail format: %s", str(head_tail_format)])
        children = tree.vchildren
        head = children[0]
        first = children[1]
        last = children[-2]
        tail = children[-1]
        # remove HEAD fragment if needed
        if head_tail_format != SyncMapHeadTailFormat.ADD:
            tree.remove_child(0)
            self.log(u"Removed HEAD")
        # stretch first and last fragment timings if needed
        if head_tail_format == SyncMapHeadTailFormat.STRETCH:
            self.log([u"Stretched first.begin: %.3f => %.3f (head)", first.begin, head.begin])
            self.log([u"Stretched last.end: %.3f => %.3f (tail)", last.end, tail.end])
            first.begin = head.begin
            last.end = tail.end
        # remove TAIL fragment if needed
        if head_tail_format != SyncMapHeadTailFormat.ADD:
            tree.remove_child(-1)
            self.log(u"Removed TAIL")
        # return sync map
        sync_map = SyncMap()
        sync_map.fragments_tree = tree
        return sync_map

    # TODO can this be done during the alignment?
    def _check_no_zero(self, min_mws):
        """
        Check for fragments with zero duration,
        and nudge them forward by a small delta when there is room.

        NOTE(review): the ``min_mws`` parameter is currently unused in this
        body (the shift delta is fixed at 0.001 s) — confirm intent.
        """
        if self.task.configuration["o_no_zero"]:
            self.log(u"Checking for fragments with zero duration...")
            delta = TimeValue("0.001")
            leaves = self.task.sync_map.fragments_tree.vleaves_not_empty
            # first and last leaves are HEAD and TAIL, skipping them
            max_index = len(leaves) - 1
            self.log([u"Fragment min index: %d", 1])
            self.log([u"Fragment max index: %d", max_index - 1])
            for i in range(1, max_index):
                self.log([u"Checking index: %d", i])
                # j scans forward over the run of zero-duration
                # fragments starting at i (end == begin chained)
                j = i
                while (j < max_index) and (leaves[j].end == leaves[i].begin):
                    j += 1
                if j != i:
                    self.log(u"Fragment(s) with zero duration:")
                    for k in range(i, j):
                        self.log([u" %d : %s", k, leaves[k]])
                    if leaves[j].end - leaves[j].begin > (j - i) * delta:
                        # there is room after
                        # to move each zero fragment forward by 0.001
                        for k in range(j - i):
                            shift = (k + 1) * delta
                            leaves[i + k].end += shift
                            leaves[i + k + 1].begin += shift
                            self.log([u" Moved fragment %d forward by %.3f", i + k, shift])
                    else:
                        self.log_warn(u" Unable to fix")
                    # NOTE(review): rebinding the for-loop variable has no
                    # effect on the iteration in Python; the intent appears to
                    # be skipping past the fixed run — confirm
                    i = j - 1
            self.log(u"Checking for fragments with zero duration... done")
        else:
            self.log(u"Not checking for fragments with zero duration")