Example #1
    def _synthesize(self):
        """
        Synthesize text into a ``wav`` file.

        Return a quadruple:

        1. a success bool flag
        2. handler of the generated wave file
        3. path of the generated wave file
        4. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        """
        self._log("Synthesizing text")
        handler = None
        path = None
        anchors = None
        try:
            self._log("Creating an output tempfile")
            handler, path = tempfile.mkstemp(
                suffix=".wav",
                dir=gf.custom_tmp_dir()
            )
            self._log("Creating Synthesizer object")
            synt = Synthesizer(logger=self.logger)
            self._log("Synthesizing...")
            anchors = synt.synthesize(self.task.text_file, path)
            self._log("Synthesizing... done")
            self._log("Synthesizing text: succeeded")
            return (True, handler, path, anchors)
        except:
            self._log("Synthesizing text: failed")
            return (False, handler, path, anchors)
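
The success flag in this quadruple lets a caller distinguish a failed synthesis from a successful one while still receiving whatever handler and path were created, so the temporary file can be cleaned up in either case. Below is a minimal, hypothetical caller sketch; the `executor` argument stands in for an instance of the class that defines `_synthesize` above.

import os

def run_synthesis_step(executor):
    # _synthesize() returns (succeeded, handler, path, anchors)
    succeeded, handler, path, anchors = executor._synthesize()
    try:
        if not succeeded:
            return None
        # anchors[i] is the start time, in seconds, of text fragment i
        return list(enumerate(anchors))
    finally:
        # mkstemp() leaves both an open descriptor and a file on disk
        if handler is not None:
            os.close(handler)
        if path is not None and os.path.exists(path):
            os.remove(path)
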
Example #2
    def _synthesize(self):
        """
        Synthesize text into a ``wav`` file.

        Return a quadruple:

        1. a success bool flag
        2. handler of the generated wave file
        3. path of the generated wave file
        4. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        """
        self._log("Synthesizing text")
        handler = None
        path = None
        anchors = None
        try:
            self._log("Creating an output tempfile")
            handler, path = tempfile.mkstemp(suffix=".wav",
                                             dir=gf.custom_tmp_dir())
            self._log("Creating Synthesizer object")
            synt = Synthesizer(logger=self.logger)
            self._log("Synthesizing...")
            result = synt.synthesize(self.task.text_file, path)
            anchors = result[0]
            self._log("Synthesizing... done")
            self._log("Synthesizing text: succeeded")
            return (True, handler, path, anchors)
        except Exception as e:
            self._log("Synthesizing text: failed")
            self._log(["Message: %s", str(e)])
            return (False, handler, path, anchors)
Example #3
 def test_synthesize(self):
     handler, output_file_path = tempfile.mkstemp(suffix=".wav")
     tfl = TextFile(get_abs_path("res/inputtext/sonnet_plain.txt"), TextFileFormat.PLAIN)
     tfl.set_language(Language.EN)
     synth = Synthesizer()
     anchors = synth.synthesize(tfl, output_file_path)
     self.assertGreater(len(anchors), 0)
     os.remove(output_file_path)
Example #4
 def test_synthesize_with_unicode(self):
     handler, output_file_path = tempfile.mkstemp(suffix=".wav")
     tfl = TextFile(get_abs_path("res/inputtext/de_utf8.txt"), TextFileFormat.PARSED)
     tfl.set_language(Language.DE)
     synth = Synthesizer()
     anchors = synth.synthesize(tfl, output_file_path)
     self.assertGreater(len(anchors), 0)
     os.remove(output_file_path)
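
Both tests above follow the same pattern: create a temporary WAV path, build a `TextFile`, set its language, synthesize, and assert that anchors were produced; note that they never close the descriptor returned by `tempfile.mkstemp`. The sketch below is a hedged, self-contained variant of that pattern which also closes the descriptor. The module paths are the usual aeneas ones; depending on the aeneas version, `Language.EN` may be spelled `Language.ENG` (as in Examples #6 and #7), and `synthesize()` may return the anchor list directly or a tuple whose first element is that list.

import os
import tempfile

from aeneas.language import Language
from aeneas.synthesizer import Synthesizer
from aeneas.textfile import TextFile, TextFileFormat

def synthesize_to_tmp_wav(input_path, text_format=TextFileFormat.PLAIN, language=Language.EN):
    handler, output_file_path = tempfile.mkstemp(suffix=".wav")
    try:
        tfl = TextFile(input_path, text_format)
        tfl.set_language(language)
        anchors = Synthesizer().synthesize(tfl, output_file_path)
        return anchors
    finally:
        # close the descriptor and remove the file once we are done with it
        os.close(handler)
        os.remove(output_file_path)
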
Example #5
 def perform(self, path, logger=None, quit_after=None, backwards=False):
     handler, output_file_path = tempfile.mkstemp(suffix=".wav")
     tfl = TextFile(get_abs_path(path), TextFileFormat.PLAIN)
     tfl.set_language(Language.EN)
     synth = Synthesizer(logger=logger)
     result = synth.synthesize(tfl, output_file_path, quit_after=quit_after, backwards=backwards)
     delete_file(handler, output_file_path)
     return result
Example #6
 def inner(c_ext, cew_subprocess):
     handler, output_file_path = gf.tmp_file(suffix=".wav")
     tfl = TextFile(gf.absolute_path(path, __file__), TextFileFormat.PLAIN)
     tfl.set_language(Language.ENG)
     synth = Synthesizer(logger=logger)
     synth.rconf[RuntimeConfiguration.C_EXTENSIONS] = c_ext
     synth.rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = cew_subprocess
     result = synth.synthesize(tfl, output_file_path, quit_after=quit_after, backwards=backwards)
     gf.delete_file(handler, output_file_path)
     self.assertEqual(len(result[0]), expected)
     if expected2 is not None:
         self.assertAlmostEqual(result[1], expected2, places=0)
Example #7
 def inner(c_ext, cew_subprocess):
     handler, output_file_path = gf.tmp_file(suffix=".wav")
     tfl = TextFile(gf.absolute_path(path, __file__), TextFileFormat.PLAIN)
     tfl.set_language(Language.ENG)
     synth = Synthesizer(logger=logger)
     synth.rconf[RuntimeConfiguration.C_EXTENSIONS] = c_ext
     synth.rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = cew_subprocess
     result = synth.synthesize(tfl, output_file_path, quit_after=quit_after, backwards=backwards)
     gf.delete_file(handler, output_file_path)
     self.assertEqual(len(result[0]), expected)
     if expected2 is not None:
         self.assertAlmostEqual(result[1], expected2, places=0)
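
The two `inner` helpers above differ from the earlier `perform` only in that they flip runtime flags on the synthesizer before synthesizing, which is how the aeneas tests compare the C extension path against the pure-Python and subprocess paths. The small sketch below isolates that pattern; the factory function and its parameter names are illustrative, while the two `RuntimeConfiguration` keys are the ones used in the examples.

from aeneas.runtimeconfiguration import RuntimeConfiguration
from aeneas.synthesizer import Synthesizer

def make_synthesizer(use_c_extensions=True, use_cew_subprocess=False, logger=None):
    # build the synthesizer first, then flip the relevant runtime flags,
    # exactly as the inner() helpers above do
    synth = Synthesizer(logger=logger)
    synth.rconf[RuntimeConfiguration.C_EXTENSIONS] = use_c_extensions
    synth.rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = use_cew_subprocess
    return synth
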
Example #8
    def _synthesize(self, text_file):
        """
        Synthesize text into a WAVE file.

        Return:

        1. handler of the generated wave file
        2. path of the generated wave file
        3. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        4. if the synthesizer produced a PCM16 mono WAVE file

        :param text_file: the text to be synthesized
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :rtype: tuple (handler, string, list, bool)
        """
        synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
        handler, path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        result = synthesizer.synthesize(text_file, path)
        anchors = result[0]
        return (handler, path, anchors, synthesizer.output_is_mono_wave)
Example #9
    def _synthesize(self, text_file):
        """
        Synthesize text into a WAVE file.

        Return:

        1. handler of the generated wave file
        2. path of the generated wave file
        3. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        4. if the synthesizer produced a PCM16 mono WAVE file

        :param text_file: the text to be synthesized
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :rtype: tuple (handler, string, list, bool)
        """
        synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
        handler, path = gf.tmp_file(
            suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        result = synthesizer.synthesize(text_file, path)
        anchors = result[0]
        return (handler, path, anchors, synthesizer.output_is_mono_wave)
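
Compared with the earlier variants, this `_synthesize` also reports whether the synthesizer already produced a PCM16 mono WAVE, which tells the caller whether a conversion is needed before extracting MFCCs. The hypothetical caller sketch below reuses the `gf.delete_file` helper that the surrounding code already uses; `file_format=None` forcing a conversion is documented in Example #12's `_load_audio_file`, while the concrete `("pcm_s16le", 1, 16000)` tuple is an assumption about the TTS engine's output.

import aeneas.globalfunctions as gf
from aeneas.audiofilemfcc import AudioFileMFCC

def synthesize_and_extract_mfcc(executor, text_file):
    # executor stands in for an instance of the class defining _synthesize() above
    handler, path, anchors, is_mono_wave = executor._synthesize(text_file)
    try:
        file_format = None  # None forces conversion to PCM16 mono WAVE
        if is_mono_wave:
            # the file can be read as-is; the exact (codec, channels, rate)
            # tuple below is an assumption about the synthesizer's output
            file_format = ("pcm_s16le", 1, 16000)
        return AudioFileMFCC(file_path=path, file_format=file_format), anchors
    finally:
        gf.delete_file(handler, path)
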
Example #10
 def test_clear_cache(self):
     synth = Synthesizer()
     synth.clear_cache()
Example #11
 def test_synthesize_none(self):
     synth = Synthesizer()
     with self.assertRaises(TypeError):
         synth.synthesize(None, self.PATH_NOT_WRITEABLE)
Example #12
class ExecuteTask(Loggable):
    """
    Execute a task, that is, compute the sync map for it.

    :param task: the task to be executed
    :type  task: :class:`~aeneas.task.Task`
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    """

    TAG = u"ExecuteTask"

    def __init__(self, task=None, rconf=None, logger=None):
        super(ExecuteTask, self).__init__(rconf=rconf, logger=logger)
        self.task = task
        self.step_index = 1
        self.step_label = u""
        self.step_begin_time = None
        self.step_total = 0.000
        self.synthesizer = None
        if task is not None:
            self.load_task(self.task)

    def load_task(self, task):
        """
        Load the task from the given ``Task`` object.

        :param task: the task to load
        :type  task: :class:`~aeneas.task.Task`
        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`: if ``task`` is not an instance of :class:`~aeneas.task.Task`
        """
        if not isinstance(task, Task):
            self.log_exc(u"task is not an instance of Task", None, True,
                         ExecuteTaskInputError)
        self.task = task

    def _step_begin(self, label, log=True):
        """ Log begin of a step """
        if log:
            self.step_label = label
            self.step_begin_time = self.log(u"STEP %d BEGIN (%s)" %
                                            (self.step_index, label))

    def _step_end(self, log=True):
        """ Log end of a step """
        if log:
            step_end_time = self.log(u"STEP %d END (%s)" %
                                     (self.step_index, self.step_label))
            diff = (step_end_time - self.step_begin_time)
            diff = float(diff.seconds + diff.microseconds / 1000000.0)
            self.step_total += diff
            self.log(u"STEP %d DURATION %.3f (%s)" %
                     (self.step_index, diff, self.step_label))
            self.step_index += 1

    def _step_failure(self, exc):
        """ Log failure of a step """
        self.log_crit(u"STEP %d (%s) FAILURE" %
                      (self.step_index, self.step_label))
        self.step_index += 1
        self.log_exc(u"Unexpected error while executing task", exc, True,
                     ExecuteTaskExecutionError)

    def _step_total(self):
        """ Log total """
        self.log(u"STEP T DURATION %.3f" % (self.step_total))

    def execute(self):
        """
        Execute the task.
        The sync map produced will be stored inside the task object.

        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`: if there is a problem with the input parameters
        :raises: :class:`~aeneas.executetask.ExecuteTaskExecutionError`: if there is a problem during the task execution
        """
        self.log(u"Executing task...")

        # check that we have the AudioFile object
        if self.task.audio_file is None:
            self.log_exc(u"The task does not seem to have its audio file set",
                         None, True, ExecuteTaskInputError)
        if ((self.task.audio_file.audio_length is None)
                or (self.task.audio_file.audio_length <= 0)):
            self.log_exc(u"The task seems to have an invalid audio file", None,
                         True, ExecuteTaskInputError)
        task_max_audio_length = self.rconf[
            RuntimeConfiguration.TASK_MAX_AUDIO_LENGTH]
        if ((task_max_audio_length > 0) and
            (self.task.audio_file.audio_length > task_max_audio_length)):
            self.log_exc(
                u"The audio file of the task has length %.3f, more than the maximum allowed (%.3f)."
                % (self.task.audio_file.audio_length, task_max_audio_length),
                None, True, ExecuteTaskInputError)

        # check that we have the TextFile object
        if self.task.text_file is None:
            self.log_exc(u"The task does not seem to have its text file set",
                         None, True, ExecuteTaskInputError)
        if len(self.task.text_file) == 0:
            self.log_exc(u"The task text file seems to have no text fragments",
                         None, True, ExecuteTaskInputError)
        task_max_text_length = self.rconf[
            RuntimeConfiguration.TASK_MAX_TEXT_LENGTH]
        if ((task_max_text_length > 0)
                and (len(self.task.text_file) > task_max_text_length)):
            self.log_exc(
                u"The text file of the task has %d fragments, more than the maximum allowed (%d)."
                % (len(self.task.text_file), task_max_text_length), None, True,
                ExecuteTaskInputError)
        if self.task.text_file.chars == 0:
            self.log_exc(u"The task text file seems to have empty text", None,
                         True, ExecuteTaskInputError)

        self.log(u"Both audio and text input file are present")

        # execute
        self.step_index = 1
        self.step_total = 0.000
        if self.task.text_file.file_format in TextFileFormat.MULTILEVEL_VALUES:
            self._execute_multi_level_task()
        else:
            self._execute_single_level_task()
        self.log(u"Executing task... done")

    def _execute_single_level_task(self):
        """ Execute a single-level task """
        self.log(u"Executing single level task...")
        try:
            # load audio file, extract MFCCs from real wave, clear audio file
            self._step_begin(u"extract MFCC real wave")
            real_wave_mfcc = self._extract_mfcc(
                file_path=self.task.audio_file_path_absolute,
                file_format=None,
            )
            self._step_end()

            # compute head and/or tail and set it
            self._step_begin(u"compute head tail")
            (head_length, process_length,
             tail_length) = self._compute_head_process_tail(real_wave_mfcc)
            real_wave_mfcc.set_head_middle_tail(head_length, process_length,
                                                tail_length)
            self._step_end()

            # compute alignment, outputting a tree of time intervals
            self._set_synthesizer()
            sync_root = Tree()
            self._execute_inner(real_wave_mfcc,
                                self.task.text_file,
                                sync_root=sync_root,
                                force_aba_auto=False,
                                log=True,
                                leaf_level=True)
            self._clear_cache_synthesizer()

            # create syncmap and add it to task
            self._step_begin(u"create sync map")
            self._create_sync_map(sync_root=sync_root)
            self._step_end()

            # log total
            self._step_total()
            self.log(u"Executing single level task... done")
        except Exception as exc:
            self._step_failure(exc)

    def _execute_multi_level_task(self):
        """ Execute a multi-level task """
        self.log(u"Executing multi level task...")

        self.log(u"Saving rconf...")
        # save original rconf
        orig_rconf = self.rconf.clone()
        # clone rconfs and set granularity
        # TODO the following code assumes 3 levels: generalize this
        level_rconfs = [
            None,
            self.rconf.clone(),
            self.rconf.clone(),
            self.rconf.clone()
        ]
        level_mfccs = [None, None, None, None]
        force_aba_autos = [None, False, False, True]
        for i in range(1, len(level_rconfs)):
            level_rconfs[i].set_granularity(i)
            self.log([u"Level %d mmn: %s", i, level_rconfs[i].mmn])
            self.log([u"Level %d mwl: %.3f", i, level_rconfs[i].mwl])
            self.log([u"Level %d mws: %.3f", i, level_rconfs[i].mws])
            level_rconfs[i].set_tts(i)
            self.log([u"Level %d tts: %s", i, level_rconfs[i].tts])
            self.log([u"Level %d tts_path: %s", i, level_rconfs[i].tts_path])
        self.log(u"Saving rconf... done")
        try:
            self.log(u"Creating AudioFile object...")
            audio_file = self._load_audio_file()
            self.log(u"Creating AudioFile object... done")

            # extract MFCC for each level
            for i in range(1, len(level_rconfs)):
                self._step_begin(u"extract MFCC real wave level %d" % i)
                if (i == 1) or (
                        level_rconfs[i].mws != level_rconfs[i - 1].mws) or (
                            level_rconfs[i].mwl != level_rconfs[i - 1].mwl):
                    self.rconf = level_rconfs[i]
                    level_mfccs[i] = self._extract_mfcc(audio_file=audio_file)
                else:
                    self.log(u"Keeping MFCC real wave from previous level")
                    level_mfccs[i] = level_mfccs[i - 1]
                self._step_end()

            self.log(u"Clearing AudioFile object...")
            self.rconf = level_rconfs[1]
            self._clear_audio_file(audio_file)
            self.log(u"Clearing AudioFile object... done")

            # compute head tail for the entire real wave (level 1)
            self._step_begin(u"compute head tail")
            (head_length, process_length,
             tail_length) = self._compute_head_process_tail(level_mfccs[1])
            level_mfccs[1].set_head_middle_tail(head_length, process_length,
                                                tail_length)
            self._step_end()

            # compute alignment at each level
            sync_root = Tree()
            sync_roots = [sync_root]
            text_files = [self.task.text_file]
            number_levels = len(level_rconfs)
            for i in range(1, number_levels):
                self._step_begin(u"compute alignment level %d" % i)
                self.rconf = level_rconfs[i]
                text_files, sync_roots = self._execute_level(
                    level=i,
                    audio_file_mfcc=level_mfccs[i],
                    text_files=text_files,
                    sync_roots=sync_roots,
                    force_aba_auto=force_aba_autos[i],
                )
                self._step_end()

            # restore original rconf, and create syncmap and add it to task
            self._step_begin(u"create sync map")
            self.rconf = orig_rconf
            self._create_sync_map(sync_root=sync_root)
            self._step_end()

            self._step_total()
            self.log(u"Executing multi level task... done")
        except Exception as exc:
            self._step_failure(exc)

    def _execute_level(self,
                       level,
                       audio_file_mfcc,
                       text_files,
                       sync_roots,
                       force_aba_auto=False):
        """
        Compute the alignment for all the nodes in the given level.

        Return a pair (next_level_text_files, next_level_sync_roots),
        containing two lists of text file subtrees and sync map subtrees
        on the next level.

        :param int level: the level
        :param audio_file_mfcc: the audio MFCC representation for this level
        :type  audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param list text_files: a list of :class:`~aeneas.textfile.TextFile` objects,
                                each representing a (sub)tree of the Task text file
        :param list sync_roots: a list of :class:`~aeneas.tree.Tree` objects,
                                each representing a SyncMapFragment tree,
                                one for each element in ``text_files``
        :param bool force_aba_auto: if ``True``, force using the AUTO ABA algorithm
        :rtype: (list, list)
        """
        self._set_synthesizer()
        next_level_text_files = []
        next_level_sync_roots = []
        for text_file_index, text_file in enumerate(text_files):
            self.log([u"Text level %d, fragment %d", level, text_file_index])
            self.log([u"  Len:   %d", len(text_file)])
            sync_root = sync_roots[text_file_index]
            if (level > 1) and (len(text_file) == 1):
                self.log(
                    u"Level > 1 and only one text fragment => return trivial tree"
                )
                self._append_trivial_tree(text_file, sync_root)
            elif (level > 1) and (sync_root.value.begin
                                  == sync_root.value.end):
                self.log(
                    u"Level > 1 and parent has begin == end => return trivial tree"
                )
                self._append_trivial_tree(text_file, sync_root)
            else:
                self.log(
                    u"Level == 1 or more than one text fragment with non-zero parent => compute tree"
                )
                if not sync_root.is_empty:
                    begin = sync_root.value.begin
                    end = sync_root.value.end
                    self.log([u"  Setting begin: %.3f", begin])
                    self.log([u"  Setting end:   %.3f", end])
                    audio_file_mfcc.set_head_middle_tail(head_length=begin,
                                                         middle_length=(end -
                                                                        begin))
                else:
                    self.log(u"  No begin or end to set")
                self._execute_inner(audio_file_mfcc,
                                    text_file,
                                    sync_root=sync_root,
                                    force_aba_auto=force_aba_auto,
                                    log=False,
                                    leaf_level=(level == 3))
            # store next level roots
            next_level_text_files.extend(text_file.children_not_empty)
            # we added head and tail, we must not pass them to the next level
            next_level_sync_roots.extend(sync_root.children[1:-1])
        self._clear_cache_synthesizer()
        return (next_level_text_files, next_level_sync_roots)

    def _execute_inner(self,
                       audio_file_mfcc,
                       text_file,
                       sync_root=None,
                       force_aba_auto=False,
                       log=True,
                       leaf_level=False):
        """
        Align a subinterval of the given AudioFileMFCC
        with the given TextFile.

        Return the computed tree of time intervals,
        rooted at ``sync_root`` if the latter is not ``None``,
        or as a new ``Tree`` otherwise.

        The begin and end positions inside the AudioFileMFCC
        must have been set ahead by the caller.

        The text fragments being aligned are the vchildren of ``text_file``.

        :param audio_file_mfcc: the audio file MFCC representation
        :type  audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param text_file: the text file subtree to align
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :param sync_root: the tree node to which fragments should be appended
        :type  sync_root: :class:`~aeneas.tree.Tree`
        :param bool force_aba_auto: if ``True``, do not run aba algorithm
        :param bool log: if ``True``, log steps
        :param bool leaf_level: alert aba if the computation is at a leaf level
        :rtype: :class:`~aeneas.tree.Tree`
        """
        if 'timings' in text_file.file_path:
            self._step_begin(u"extract timings", log=log)
            synt_path, synt_anchors, synt_format = self._provide_times(
                text_file)
            self._step_end(log=log)

            self._step_begin(u"extract MFCC synt wave", log=log)
            synt_wave_mfcc = self._extract_mfcc(
                file_path=synt_path,
                file_format=synt_format,
            )
            # gf.delete_file(synt_handler, synt_path)
            self._step_end(log=log)

        else:
            self._step_begin(u"synthesize text", log=log)
            func = '_time_and_combine' if 'clips' in text_file.file_path else '_synthesize'
            synt_handler, synt_path, synt_anchors, synt_format = getattr(
                self, func)(text_file)
            self._step_end(log=log)

            self._step_begin(u"extract MFCC synt wave", log=log)
            synt_wave_mfcc = self._extract_mfcc(
                file_path=synt_path,
                file_format=synt_format,
            )
            gf.delete_file(synt_handler, synt_path)
            self._step_end(log=log)

        self._step_begin(u"align waves", log=log)
        indices = self._align_waves(audio_file_mfcc, synt_wave_mfcc,
                                    synt_anchors)
        self._step_end(log=log)

        self._step_begin(u"adjust boundaries", log=log)
        self._adjust_boundaries(indices, text_file, audio_file_mfcc, sync_root,
                                force_aba_auto, leaf_level)
        self._step_end(log=log)

    def _provide_times(self, text_file):
        """
        Read externally provided timings instead of synthesizing.

        Each row of the file behind ``text_file`` is expected to be
        ``verse,start,wav_path``; return the path of the WAVE file
        referenced by the first row, the list of ``[start, verse, wav_path]``
        anchors, and a tuple describing the audio format.
        """
        with open(text_file.file_path) as file:
            timings = [row.strip().split(',') for row in file.readlines()]
            synt_anchors = [[TimeValue(start), verse, file]
                            for verse, start, file in timings]

        synt_wav = timings[0][-1]
        synt_format = ('pcm_s16le', 1, 2)
        return synt_wav, synt_anchors, synt_format

    def _time_and_combine(self, text_file):
        """
        Combine original audio clips into a single WAV file.

        Return a tuple consisting of:

        1. the handler of the generated audio file
        2. the path of the generated audio file
        3. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        4. a tuple describing the format of the audio file

        :param text_file: the text with audio clips to be timed/combined
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :rtype: tuple (handler, string, list, tuple)
        """
        import subprocess

        # Concatenate all clips into a single, temporary file
        handler, path = gf.tmp_file(
            suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        cmd = "ffmpeg -y -f concat -i {} -c copy {}".format(
            text_file.file_path, path)
        subprocess.call(cmd, shell=True)

        audio_format = ('pcm_s16le', 1, 2)

        # Build "synt" anchor times
        anchor_time, anchors = TimeValue('0.0'), []
        for fragment in text_file.fragments:
            audio_path = 'output/sample/{}'.format(fragment.text.split("'")[1])
            audio_file = AudioFileMFCC(file_path=audio_path,
                                       file_format=audio_format)
            # TODO: Investigate faster ways to get the audio_length
            # cmd = 'ffprobe -i {} -show_entries format=duration -v quiet -of csv="p=0"'.format(audio_path)
            # subprocess.call(cmd, shell=True)
            # # should become... (to get response)
            # cmds = ['ffprobe', '-i', audio_path, '-show_entries', 'format=duration',
            #         '-v', 'quiet', '-of', 'csv="p=0"']
            # p = subprocess.Popen(cmds, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # output, err = p.communicate()
            # audio_length = TimeValue(output)
            anchors.append([anchor_time, fragment.identifier, audio_path])
            anchor_time += audio_file.audio_length

        # [
        #     [TimeValue('0.0'), u'f000001', 'output/sample/audio01.wav'],
        #     [TimeValue('0.339625'), u'f000002', 'output/sample/audio02.wav'],
        #     [TimeValue('3.5526875'), u'f000003', 'output/sample/audio03.wav'],
        #     [TimeValue('6.6874375'), u'f000004', 'output/sample/audio04.wav'],
        #     [TimeValue('9.5609375'), u'f000005', 'output/sample/audio05.wav'],
        #     [TimeValue('12.4344375'), u'f000006', 'output/sample/audio06.wav'],
        #     [TimeValue('16.1961250'), u'f000007', 'output/sample/audio07.wav'],
        #     [TimeValue('19.9578125'), u'f000008', 'output/sample/audio08.wav'],
        #     [TimeValue('23.0925625'), u'f000009', 'output/sample/audio09.wav'],
        #     [TimeValue('28.0297500'), u'f000010', 'output/sample/audio10.wav'],
        #     [TimeValue('31.1645000'), u'f000011', 'output/sample/audio11.wav'],
        #     [TimeValue('33.5678125'), u'f000012', 'output/sample/audio12.wav'],
        #     [TimeValue('37.0943750'), u'f000013', 'output/sample/audio13.wav'],
        #     [TimeValue('40.2030000'), u'f000014', 'output/sample/audio14.wav'],
        #     [TimeValue('43.8601875'), u'f000015', 'output/sample/audio15.wav']
        # ]

        # import pdb; pdb.set_trace()

        return (handler, path, anchors, audio_format)

    def _load_audio_file(self):
        """
        Load audio in memory.

        :rtype: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"load audio file")
        # NOTE file_format=None forces conversion to
        #      PCM16 mono WAVE with default sample rate
        audio_file = AudioFile(file_path=self.task.audio_file_path_absolute,
                               file_format=None,
                               rconf=self.rconf,
                               logger=self.logger)
        audio_file.read_samples_from_file()
        self._step_end()
        return audio_file

    def _clear_audio_file(self, audio_file):
        """
        Clear audio from memory.

        :param audio_file: the object to clear
        :type  audio_file: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"clear audio file")
        audio_file.clear_data()
        audio_file = None
        self._step_end()

    def _extract_mfcc(self, file_path=None, file_format=None, audio_file=None):
        """
        Extract the MFCCs from the given audio file.

        :rtype: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        """
        audio_file_mfcc = AudioFileMFCC(file_path=file_path,
                                        file_format=file_format,
                                        audio_file=audio_file,
                                        rconf=self.rconf,
                                        logger=self.logger)
        if self.rconf.mmn:
            self.log(u"Running VAD inside _extract_mfcc...")
            audio_file_mfcc.run_vad(
                log_energy_threshold=self.rconf[
                    RuntimeConfiguration.MFCC_MASK_LOG_ENERGY_THRESHOLD],
                min_nonspeech_length=self.rconf[
                    RuntimeConfiguration.MFCC_MASK_MIN_NONSPEECH_LENGTH],
                extend_before=self.rconf[
                    RuntimeConfiguration.
                    MFCC_MASK_EXTEND_SPEECH_INTERVAL_BEFORE],
                extend_after=self.rconf[
                    RuntimeConfiguration.
                    MFCC_MASK_EXTEND_SPEECH_INTERVAL_AFTER])
            self.log(u"Running VAD inside _extract_mfcc... done")
        return audio_file_mfcc

    def _compute_head_process_tail(self, audio_file_mfcc):
        """
        Set the audio file head or tail,
        by either reading the explicit values
        from the Task configuration,
        or using SD to determine them.

        This function returns the lengths, in seconds,
        of the (head, process, tail).

        :rtype: tuple (float, float, float)
        """
        head_length = self.task.configuration["i_a_head"]
        process_length = self.task.configuration["i_a_process"]
        tail_length = self.task.configuration["i_a_tail"]
        head_max = self.task.configuration["i_a_head_max"]
        head_min = self.task.configuration["i_a_head_min"]
        tail_max = self.task.configuration["i_a_tail_max"]
        tail_min = self.task.configuration["i_a_tail_min"]
        if ((head_length is not None) or (process_length is not None)
                or (tail_length is not None)):
            self.log(u"Setting explicit head process tail")
        else:
            self.log(u"Detecting head tail...")
            sd = SD(audio_file_mfcc,
                    self.task.text_file,
                    rconf=self.rconf,
                    logger=self.logger)
            head_length = TimeValue("0.000")
            process_length = None
            tail_length = TimeValue("0.000")
            if (head_min is not None) or (head_max is not None):
                self.log(u"Detecting HEAD...")
                head_length = sd.detect_head(head_min, head_max)
                self.log([u"Detected HEAD: %.3f", head_length])
                self.log(u"Detecting HEAD... done")
            if (tail_min is not None) or (tail_max is not None):
                self.log(u"Detecting TAIL...")
                tail_length = sd.detect_tail(tail_min, tail_max)
                self.log([u"Detected TAIL: %.3f", tail_length])
                self.log(u"Detecting TAIL... done")
            self.log(u"Detecting head tail... done")
        self.log([u"Head:    %s", gf.safe_float(head_length, None)])
        self.log([u"Process: %s", gf.safe_float(process_length, None)])
        self.log([u"Tail:    %s", gf.safe_float(tail_length, None)])
        return (head_length, process_length, tail_length)

    def _set_synthesizer(self):
        """ Create synthesizer """
        self.log(u"Setting synthesizer...")
        self.synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
        self.log(u"Setting synthesizer... done")

    def _clear_cache_synthesizer(self):
        """ Clear the cache of the synthesizer """
        self.log(u"Clearing synthesizer...")
        self.synthesizer.clear_cache()
        self.log(u"Clearing synthesizer... done")

    def _synthesize(self, text_file):
        """
        Synthesize text into a WAVE file.

        Return a tuple consisting of:

        1. the handler of the generated audio file
        2. the path of the generated audio file
        3. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        4. a tuple describing the format of the audio file

        :param text_file: the text to be synthesized
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :rtype: tuple (handler, string, list, tuple)
        """
        handler, path = gf.tmp_file(
            suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        result = self.synthesizer.synthesize(text_file, path)
        return (handler, path, result[0], self.synthesizer.output_audio_format)

    def _align_waves(self, real_wave_mfcc, synt_wave_mfcc, synt_anchors):
        """
        Align two AudioFileMFCC objects,
        representing WAVE files.

        Return a list of boundary indices.
        """
        self.log(u"Creating DTWAligner...")
        aligner = DTWAligner(real_wave_mfcc,
                             synt_wave_mfcc,
                             rconf=self.rconf,
                             logger=self.logger)
        self.log(u"Creating DTWAligner... done")
        self.log(u"Computing boundary indices...")
        boundary_indices = aligner.compute_boundaries(synt_anchors)
        self.log(u"Computing boundary indices... done")
        return boundary_indices

    def _adjust_boundaries(self,
                           boundary_indices,
                           text_file,
                           real_wave_mfcc,
                           sync_root,
                           force_aba_auto=False,
                           leaf_level=False):
        """
        Adjust boundaries as requested by the user.

        Return the computed time map, that is,
        a list of pairs ``[start_time, end_time]``,
        of length equal to number of fragments + 2,
        where the two extra elements are for
        the HEAD (first) and TAIL (last).
        """
        # boundary_indices contains the boundary indices in the all_mfcc of real_wave_mfcc
        # starting with the (head-1st fragment) and ending with (-1th fragment-tail)
        aba_parameters = self.task.configuration.aba_parameters()
        if force_aba_auto:
            self.log(u"Forced running algorithm: 'auto'")
            aba_parameters["algorithm"] = (AdjustBoundaryAlgorithm.AUTO, [])
            # note that the other aba settings (nonspeech and nozero)
            # remain as specified by the user
        self.log([u"ABA parameters: %s", aba_parameters])
        aba = AdjustBoundaryAlgorithm(rconf=self.rconf, logger=self.logger)
        aba.adjust(aba_parameters=aba_parameters,
                   real_wave_mfcc=real_wave_mfcc,
                   boundary_indices=boundary_indices,
                   text_file=text_file,
                   allow_arbitrary_shift=leaf_level)
        aba.append_fragment_list_to_sync_root(sync_root=sync_root)

    def _append_trivial_tree(self, text_file, sync_root):
        """
        Append trivial tree, made by one HEAD,
        one sync map fragment for each element of ``text_file``,
        and one TAIL.

        This function is called if either ``text_file`` has only one element,
        or if ``sync_root.value`` is an interval with zero length
        (i.e., ``sync_root.value.begin == sync_root.value.end``).
        """
        interval = sync_root.value
        #
        # NOTE the following is correct, but it is a bit obscure
        # time_values = [interval.begin] * (1 + len(text_file)) + [interval.end] * 2
        #
        if len(text_file) == 1:
            time_values = [
                interval.begin, interval.begin, interval.end, interval.end
            ]
        else:
            # interval.begin == interval.end
            time_values = [interval.begin] * (3 + len(text_file))
        aba = AdjustBoundaryAlgorithm(rconf=self.rconf, logger=self.logger)
        aba.intervals_to_fragment_list(text_file=text_file,
                                       time_values=time_values)
        aba.append_fragment_list_to_sync_root(sync_root=sync_root)

    def _create_sync_map(self, sync_root):
        """
        If requested, check that the computed sync map is consistent.
        Then, add it to the Task.
        """
        sync_map = SyncMap(tree=sync_root,
                           rconf=self.rconf,
                           logger=self.logger)
        if self.rconf.safety_checks:
            self.log(u"Running sanity check on computed sync map...")
            if not sync_map.leaves_are_consistent:
                self._step_failure(
                    ValueError(
                        u"The computed sync map contains inconsistent fragments"
                    ))
            self.log(u"Running sanity check on computed sync map... passed")
        else:
            self.log(u"Not running sanity check on computed sync map")
        self.task.sync_map = sync_map
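
The `ExecuteTask` class above is the internal engine; from the outside it is normally driven through a `Task` object, roughly following the usage pattern described in the aeneas documentation. A short sketch with placeholder paths:

from aeneas.executetask import ExecuteTask
from aeneas.task import Task

# the config string and the paths below are placeholders
config_string = u"task_language=eng|is_text_type=plain|os_task_file_format=json"
task = Task(config_string=config_string)
task.audio_file_path_absolute = u"/path/to/audio.mp3"
task.text_file_path_absolute = u"/path/to/plain.txt"
task.sync_map_file_path_absolute = u"/path/to/output/syncmap.json"

# compute the sync map; execute() stores it on the task object
ExecuteTask(task).execute()

# write it out in the format requested by os_task_file_format
task.output_sync_map_file()
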
Example #13
def build_sync_map(
    text_paths, audio_paths, tmp_dir,
    sync_map_text_path_prefix, sync_map_audio_path_prefix,
    skip_penalty, radius
):
    """
    This is an algorithm for building a sync map.
    It synthesizes text and then aligns synthesized audio with the recorded audio
    using a variation of the DTW (Dynamic Time Warping) algorithm.
    
    The main features of this algorithm are:
    1) It can handle structural differences in the beginning and in the end of files.
    2) It finds an approximation to an optimal warping path in linear time and space using FastDTW approach.

    Note that while the algorithm does not require one-to-one correspondance
    between text and audio files (i.e. the splitting can be done differently),
    the quality of the result is sensitive to the choice of skip_penalty and radius parameters,
    so it is recommended to have such a correspondance.

    Alignment details:
    Synthesized and recorded audio are represented as sequences of MFCC frames.
    These sequences are aligned using variation of the DTW algorithm.
    In contrast to the classic DTW, this algorithms can be used
    to align sequences with structural differences in the beginning or in the end.
    
    Steps to build a sync map:
    1) Synthesize text file and produce a list of anchors.
    Each anchor represents the start of the corresponding text fragment in a synthesized audio.
    2) Get sequences of MFCC frames of synthesized and recorded audio.
    3) Get their warping path by calling the alignment algorithm.
    4) Check whether the extra content is found, calculate mapping boundaries.
    5) Map anchors inside the boundaries to the recorded MFCC sequence using warping path from step 3.
    6) Start all over again considering:
    If there is an extra content in the end of synthesized sequence, align it with the next audio file.
    If there is an extra content in the end of recorded sequence, align it with the next text file.
    If both sequences have extra content in the end, align text tail with the next audio file.
    If none of the above, align next text and audio files.
    """

    synthesizer = Synthesizer()
    parse_parameters = {'is_text_unparsed_id_regex': 'f[0-9]+'}
    
    sync_map = {}
    process_next_text = True
    process_next_audio = True

    while True:
        if process_next_text:
            try:
                text_path = next(text_paths)
            except StopIteration:
                break

            text_name = get_name_from_path(text_path)
            output_text_name = os.path.join(sync_map_text_path_prefix, text_name)
            textfile = TextFile(text_path, file_format=TextFileFormat.UNPARSED, parameters=parse_parameters)
            textfile.set_language(Language.ENG)
            text_wav_path = os.path.join(tmp_dir, f'{drop_extension(text_name)}_text.wav')
            sync_map[output_text_name] = {}

            # Produce synthesized audio, get anchors
            anchors,_,_ = synthesizer.synthesize(textfile, text_wav_path)
            
            # Get fragments, convert anchor timings to frame indices
            fragments = [a[1] for a in anchors]
            anchors = np.array([int(a[0] / TimeValue('0.040')) for a in anchors])

            # The MFCC frame sequence memory layout is an n x l 2D array,
            # where n is the number of frames and l is the number of MFCCs,
            # i.e. it is C-contiguous, but after dropping the first coefficient it ceases to be C-contiguous.
            # Should decide whether to make a copy or to work around the first coefficient.
            text_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(text_wav_path).all_mfcc.T[:, 1:]
            )
            
        if process_next_audio:
            try:
                audio_path = next(audio_paths)
            except StopIteration:
                break

            audio_name = get_name_from_path(audio_path)
            output_audio_name = os.path.join(sync_map_audio_path_prefix, audio_name)
            audio_wav_path = os.path.join(tmp_dir, f'{drop_extension(audio_name)}_audio.wav')
            subprocess.run(['ffmpeg', '-n', '-i', audio_path, audio_wav_path])

            audio_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(audio_wav_path).all_mfcc.T[:, 1:]
            )
            
            # Keep track of the audio frame offset to calculate frame timings
            audio_start_frame = 0
        
        n = len(text_mfcc_sequence)
        m = len(audio_mfcc_sequence)

        _, path = c_FastDTWBD(text_mfcc_sequence, audio_mfcc_sequence, skip_penalty, radius=radius)
        
        if len(path) == 0:
            print(
                f'No match between {text_name} and {audio_name}. '
                f'Alignment is terminated. '
                f'Adjust skip_penalty or input files.'
            )
            return {}
        
        # Project path to the text and audio sequences
        text_path_frames = path[:,0]
        audio_path_frames = path[:,1]
        
        last_matched_audio_frame = audio_path_frames[-1]

        # Find first and last matched frames
        first_matched_text_frame = text_path_frames[0]
        last_matched_text_frame = text_path_frames[-1]

        # Map only those fragments that intersect matched frames
        anchors_boundary_indices = np.searchsorted(
            anchors, [first_matched_text_frame, last_matched_text_frame]
        )
        map_anchors_from = max(anchors_boundary_indices[0] - 1, 0)
        map_anchors_to = anchors_boundary_indices[1]
        anchors_to_map = anchors[map_anchors_from:map_anchors_to]
        fragments_to_map = fragments[map_anchors_from:map_anchors_to]

        # Get anchor indices in the path projection onto the text sequence
        text_path_anchor_indices = np.searchsorted(text_path_frames, anchors_to_map)
        
        # Get anchors' frames in audio sequence, calculate their timings
        anchors_matched_frames = audio_path_frames[text_path_anchor_indices]
        timings = (np.append(anchors_matched_frames, audio_path_frames[-1]) + audio_start_frame) * 0.040
        
        # Map fragment_ids to timings, update mapping of the current text file
        fragment_map = {
            f: {
                'audio_file': output_audio_name,
                'begin_time': time_to_str(bt),
                'end_time': time_to_str(et)
            }
            for f, bt, et in zip(fragments_to_map, timings[:-1], timings[1:])
        }

        sync_map[output_text_name].update(fragment_map)
        
        # Decide whether to process next file or to align the tail of the current one

        if map_anchors_to == len(anchors):
            # Process next text if no fragments are left
            process_next_text = True
        else:
            # Otherwise align tail of the current text
            process_next_text = False
            text_mfcc_sequence = text_mfcc_sequence[last_matched_text_frame:]
            fragments = fragments[map_anchors_to:]
            anchors = anchors[map_anchors_to:] - last_matched_text_frame
            
        if last_matched_audio_frame == m - 1 or not process_next_text:
            # Process next audio if there are no unmatched audio frames in the tail
            # or if there are more text fragments to map, i.e.
            # we choose to process the next audio if we cannot decide.
            # This strategy is correct if there are no extra fragments at the end.
            process_next_audio = True
        else:
            # Otherwise align tail of the current audio
            process_next_audio = False
            audio_mfcc_sequence = audio_mfcc_sequence[last_matched_audio_frame:]
            audio_start_frame += last_matched_audio_frame
    
    return sync_map
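
Since `build_sync_map` consumes `text_paths` and `audio_paths` with `next()`, it expects iterators rather than lists. A hypothetical driver is sketched below; the directory layout, the `skip_penalty`/`radius` values, and the output file name are all illustrative.

import glob
import json
import tempfile

# the function pulls paths with next(), so pass iterators
text_paths = iter(sorted(glob.glob("book/text/*.xhtml")))
audio_paths = iter(sorted(glob.glob("book/audio/*.mp3")))

with tempfile.TemporaryDirectory() as tmp_dir:
    sync_map = build_sync_map(
        text_paths, audio_paths, tmp_dir,
        sync_map_text_path_prefix="text",
        sync_map_audio_path_prefix="audio",
        skip_penalty=0.75,   # tune if the alignment terminates early
        radius=100,          # FastDTW search radius, in MFCC frames
    )

with open("sync_map.json", "w") as f:
    json.dump(sync_map, f, indent=2)
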
Example #14
 def test_synthesize_none(self):
     synth = Synthesizer()
     with self.assertRaises(TypeError):
         synth.synthesize(None, self.PATH_NOT_WRITEABLE)
Example #15
 def _set_synthesizer(self):
     """ Create synthesizer """
     self.log(u"Setting synthesizer...")
     self.synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
     self.log(u"Setting synthesizer... done")
Example #16
class ExecuteTask(Loggable):
    """
    Execute a task, that is, compute the sync map for it.

    :param task: the task to be executed
    :type  task: :class:`~aeneas.task.Task`
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    """

    TAG = u"ExecuteTask"

    def __init__(self, task=None, rconf=None, logger=None):
        super(ExecuteTask, self).__init__(rconf=rconf, logger=logger)
        self.task = task
        self.step_index = 1
        self.step_label = u""
        self.step_begin_time = None
        self.step_total = 0.000
        self.synthesizer = None
        if task is not None:
            self.load_task(self.task)

    def load_task(self, task):
        """
        Load the task from the given ``Task`` object.

        :param task: the task to load
        :type  task: :class:`~aeneas.task.Task`
        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`: if ``task`` is not an instance of :class:`~aeneas.task.Task`
        """
        if not isinstance(task, Task):
            self.log_exc(u"task is not an instance of Task", None, True, ExecuteTaskInputError)
        self.task = task

    def _step_begin(self, label, log=True):
        """ Log begin of a step """
        if log:
            self.step_label = label
            self.step_begin_time = self.log(u"STEP %d BEGIN (%s)" % (self.step_index, label))

    def _step_end(self, log=True):
        """ Log end of a step """
        if log:
            step_end_time = self.log(u"STEP %d END (%s)" % (self.step_index, self.step_label))
            diff = (step_end_time - self.step_begin_time)
            diff = float(diff.seconds + diff.microseconds / 1000000.0)
            self.step_total += diff
            self.log(u"STEP %d DURATION %.3f (%s)" % (self.step_index, diff, self.step_label))
            self.step_index += 1

    def _step_failure(self, exc):
        """ Log failure of a step """
        self.log_crit(u"STEP %d (%s) FAILURE" % (self.step_index, self.step_label))
        self.step_index += 1
        self.log_exc(u"Unexpected error while executing task", exc, True, ExecuteTaskExecutionError)

    def _step_total(self):
        """ Log total """
        self.log(u"STEP T DURATION %.3f" % (self.step_total))

    def execute(self):
        """
        Execute the task.
        The sync map produced will be stored inside the task object.

        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`: if there is a problem with the input parameters
        :raises: :class:`~aeneas.executetask.ExecuteTaskExecutionError`: if there is a problem during the task execution
        """
        self.log(u"Executing task...")

        # check that we have the AudioFile object
        if self.task.audio_file is None:
            self.log_exc(u"The task does not seem to have its audio file set", None, True, ExecuteTaskInputError)
        if (
                (self.task.audio_file.audio_length is None) or
                (self.task.audio_file.audio_length <= 0)
        ):
            self.log_exc(u"The task seems to have an invalid audio file", None, True, ExecuteTaskInputError)
        task_max_audio_length = self.rconf[RuntimeConfiguration.TASK_MAX_AUDIO_LENGTH]
        if (
                (task_max_audio_length > 0) and
                (self.task.audio_file.audio_length > task_max_audio_length)
        ):
            self.log_exc(u"The audio file of the task has length %.3f, more than the maximum allowed (%.3f)." % (self.task.audio_file.audio_length, task_max_audio_length), None, True, ExecuteTaskInputError)

        # check that we have the TextFile object
        if self.task.text_file is None:
            self.log_exc(u"The task does not seem to have its text file set", None, True, ExecuteTaskInputError)
        if len(self.task.text_file) == 0:
            self.log_exc(u"The task text file seems to have no text fragments", None, True, ExecuteTaskInputError)
        task_max_text_length = self.rconf[RuntimeConfiguration.TASK_MAX_TEXT_LENGTH]
        if (
                (task_max_text_length > 0) and
                (len(self.task.text_file) > task_max_text_length)
        ):
            self.log_exc(u"The text file of the task has %d fragments, more than the maximum allowed (%d)." % (len(self.task.text_file), task_max_text_length), None, True, ExecuteTaskInputError)
        if self.task.text_file.chars == 0:
            self.log_exc(u"The task text file seems to have empty text", None, True, ExecuteTaskInputError)

        self.log(u"Both audio and text input file are present")

        # execute
        self.step_index = 1
        self.step_total = 0.000
        if self.task.text_file.file_format in TextFileFormat.MULTILEVEL_VALUES:
            self._execute_multi_level_task()
        else:
            self._execute_single_level_task()
        self.log(u"Executing task... done")

    def _execute_single_level_task(self):
        """ Execute a single-level task """
        self.log(u"Executing single level task...")
        try:
            # load audio file, extract MFCCs from real wave, clear audio file
            self._step_begin(u"extract MFCC real wave")
            real_wave_mfcc = self._extract_mfcc(
                file_path=self.task.audio_file_path_absolute,
                file_format=None,
            )
            self._step_end()

            # compute head and/or tail and set it
            self._step_begin(u"compute head tail")
            (head_length, process_length, tail_length) = self._compute_head_process_tail(real_wave_mfcc)
            real_wave_mfcc.set_head_middle_tail(head_length, process_length, tail_length)
            self._step_end()

            # compute alignment, outputting a tree of time intervals
            self._set_synthesizer()
            sync_root = Tree()
            self._execute_inner(
                real_wave_mfcc,
                self.task.text_file,
                sync_root=sync_root,
                force_aba_auto=False,
                log=True,
                leaf_level=True
            )
            self._clear_cache_synthesizer()

            # create syncmap and add it to task
            self._step_begin(u"create sync map")
            self._create_sync_map(sync_root=sync_root)
            self._step_end()

            # log total
            self._step_total()
            self.log(u"Executing single level task... done")
        except Exception as exc:
            self._step_failure(exc)

    def _execute_multi_level_task(self):
        """ Execute a multi-level task """
        self.log(u"Executing multi level task...")

        self.log(u"Saving rconf...")
        # save original rconf
        orig_rconf = self.rconf.clone()
        # clone rconfs and set granularity
        # TODO the following code assumes 3 levels: generalize this
        level_rconfs = [None, self.rconf.clone(), self.rconf.clone(), self.rconf.clone()]
        level_mfccs = [None, None, None, None]
        force_aba_autos = [None, False, False, True]
        for i in range(1, len(level_rconfs)):
            level_rconfs[i].set_granularity(i)
            self.log([u"Level %d mmn: %s", i, level_rconfs[i].mmn])
            self.log([u"Level %d mwl: %.3f", i, level_rconfs[i].mwl])
            self.log([u"Level %d mws: %.3f", i, level_rconfs[i].mws])
            level_rconfs[i].set_tts(i)
            self.log([u"Level %d tts: %s", i, level_rconfs[i].tts])
            self.log([u"Level %d tts_path: %s", i, level_rconfs[i].tts_path])
        self.log(u"Saving rconf... done")
        try:
            self.log(u"Creating AudioFile object...")
            audio_file = self._load_audio_file()
            self.log(u"Creating AudioFile object... done")

            # extract MFCC for each level
            for i in range(1, len(level_rconfs)):
                self._step_begin(u"extract MFCC real wave level %d" % i)
                if (i == 1) or (level_rconfs[i].mws != level_rconfs[i - 1].mws) or (level_rconfs[i].mwl != level_rconfs[i - 1].mwl):
                    self.rconf = level_rconfs[i]
                    level_mfccs[i] = self._extract_mfcc(audio_file=audio_file)
                else:
                    self.log(u"Keeping MFCC real wave from previous level")
                    level_mfccs[i] = level_mfccs[i - 1]
                self._step_end()

            self.log(u"Clearing AudioFile object...")
            self.rconf = level_rconfs[1]
            self._clear_audio_file(audio_file)
            self.log(u"Clearing AudioFile object... done")

            # compute head tail for the entire real wave (level 1)
            self._step_begin(u"compute head tail")
            (head_length, process_length, tail_length) = self._compute_head_process_tail(level_mfccs[1])
            level_mfccs[1].set_head_middle_tail(head_length, process_length, tail_length)
            self._step_end()

            # compute alignment at each level
            sync_root = Tree()
            sync_roots = [sync_root]
            text_files = [self.task.text_file]
            number_levels = len(level_rconfs)
            for i in range(1, number_levels):
                self._step_begin(u"compute alignment level %d" % i)
                self.rconf = level_rconfs[i]
                text_files, sync_roots = self._execute_level(
                    level=i,
                    audio_file_mfcc=level_mfccs[i],
                    text_files=text_files,
                    sync_roots=sync_roots,
                    force_aba_auto=force_aba_autos[i],
                )
                self._step_end()

            # restore original rconf, and create syncmap and add it to task
            self._step_begin(u"create sync map")
            self.rconf = orig_rconf
            self._create_sync_map(sync_root=sync_root)
            self._step_end()

            self._step_total()
            self.log(u"Executing multi level task... done")
        except Exception as exc:
            self._step_failure(exc)

    def _execute_level(self, level, audio_file_mfcc, text_files, sync_roots, force_aba_auto=False):
        """
        Compute the alignment for all the nodes in the given level.

        Return a pair ``(next_level_text_files, next_level_sync_roots)``,
        containing the list of text file subtrees and the list of
        sync map subtrees at the next level.

        :param int level: the level
        :param audio_file_mfcc: the audio MFCC representation for this level
        :type  audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param list text_files: a list of :class:`~aeneas.textfile.TextFile` objects,
                                each representing a (sub)tree of the Task text file
        :param list sync_roots: a list of :class:`~aeneas.tree.Tree` objects,
                                each representing a SyncMapFragment tree,
                                one for each element in ``text_files``
        :param bool force_aba_auto: if ``True``, force using the AUTO ABA algorithm
        :rtype: (list, list)
        """
        self._set_synthesizer()
        next_level_text_files = []
        next_level_sync_roots = []
        for text_file_index, text_file in enumerate(text_files):
            self.log([u"Text level %d, fragment %d", level, text_file_index])
            self.log([u"  Len:   %d", len(text_file)])
            sync_root = sync_roots[text_file_index]
            if (level > 1) and (len(text_file) == 1):
                self.log(u"Level > 1 and only one text fragment => return trivial tree")
                self._append_trivial_tree(text_file, sync_root)
            elif (level > 1) and (sync_root.value.begin == sync_root.value.end):
                self.log(u"Level > 1 and parent has begin == end => return trivial tree")
                self._append_trivial_tree(text_file, sync_root)
            else:
                self.log(u"Level == 1 or more than one text fragment with non-zero parent => compute tree")
                if not sync_root.is_empty:
                    begin = sync_root.value.begin
                    end = sync_root.value.end
                    self.log([u"  Setting begin: %.3f", begin])
                    self.log([u"  Setting end:   %.3f", end])
                    audio_file_mfcc.set_head_middle_tail(head_length=begin, middle_length=(end - begin))
                else:
                    self.log(u"  No begin or end to set")
                self._execute_inner(
                    audio_file_mfcc,
                    text_file,
                    sync_root=sync_root,
                    force_aba_auto=force_aba_auto,
                    log=False,
                    leaf_level=(level == 3)
                )
            # store next level roots
            next_level_text_files.extend(text_file.children_not_empty)
            # we added head and tail, we must not pass them to the next level
            next_level_sync_roots.extend(sync_root.children[1:-1])
        self._clear_cache_synthesizer()
        return (next_level_text_files, next_level_sync_roots)

    def _execute_inner(self, audio_file_mfcc, text_file, sync_root=None, force_aba_auto=False, log=True, leaf_level=False):
        """
        Align a subinterval of the given AudioFileMFCC
        with the given TextFile.

        The computed sync map fragments are appended
        to the given ``sync_root`` tree node.

        The begin and end positions inside the AudioFileMFCC
        must have been set beforehand by the caller.

        The text fragments being aligned are the vchildren of ``text_file``.

        :param audio_file_mfcc: the audio file MFCC representation
        :type  audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param text_file: the text file subtree to align
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :param sync_root: the tree node to which fragments should be appended
        :type  sync_root: :class:`~aeneas.tree.Tree`
        :param bool force_aba_auto: if ``True``, force using the AUTO ABA algorithm
        :param bool log: if ``True``, log steps
        :param bool leaf_level: if ``True``, tell the ABA algorithm that the computation is at the leaf level
        """
        self._step_begin(u"synthesize text", log=log)
        synt_handler, synt_path, synt_anchors, synt_format = self._synthesize(text_file)
        self._step_end(log=log)

        self._step_begin(u"extract MFCC synt wave", log=log)
        synt_wave_mfcc = self._extract_mfcc(
            file_path=synt_path,
            file_format=synt_format,
        )
        gf.delete_file(synt_handler, synt_path)
        self._step_end(log=log)

        self._step_begin(u"align waves", log=log)
        indices = self._align_waves(audio_file_mfcc, synt_wave_mfcc, synt_anchors)
        self._step_end(log=log)

        self._step_begin(u"adjust boundaries", log=log)
        self._adjust_boundaries(indices, text_file, audio_file_mfcc, sync_root, force_aba_auto, leaf_level)
        self._step_end(log=log)

    def _load_audio_file(self):
        """
        Load audio in memory.

        :rtype: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"load audio file")
        # NOTE file_format=None forces conversion to
        #      PCM16 mono WAVE with default sample rate
        audio_file = AudioFile(
            file_path=self.task.audio_file_path_absolute,
            file_format=None,
            rconf=self.rconf,
            logger=self.logger
        )
        audio_file.read_samples_from_file()
        self._step_end()
        return audio_file

    def _clear_audio_file(self, audio_file):
        """
        Clear audio from memory.

        :param audio_file: the object to clear
        :type  audio_file: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"clear audio file")
        audio_file.clear_data()
        audio_file = None
        self._step_end()

    def _extract_mfcc(self, file_path=None, file_format=None, audio_file=None):
        """
        Extract the MFCCs from the given audio file.

        :rtype: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        """
        audio_file_mfcc = AudioFileMFCC(
            file_path=file_path,
            file_format=file_format,
            audio_file=audio_file,
            rconf=self.rconf,
            logger=self.logger
        )
        if self.rconf.mmn:
            self.log(u"Running VAD inside _extract_mfcc...")
            audio_file_mfcc.run_vad(
                log_energy_threshold=self.rconf[RuntimeConfiguration.MFCC_MASK_LOG_ENERGY_THRESHOLD],
                min_nonspeech_length=self.rconf[RuntimeConfiguration.MFCC_MASK_MIN_NONSPEECH_LENGTH],
                extend_before=self.rconf[RuntimeConfiguration.MFCC_MASK_EXTEND_SPEECH_INTERVAL_BEFORE],
                extend_after=self.rconf[RuntimeConfiguration.MFCC_MASK_EXTEND_SPEECH_INTERVAL_AFTER]
            )
            self.log(u"Running VAD inside _extract_mfcc... done")
        return audio_file_mfcc

    def _compute_head_process_tail(self, audio_file_mfcc):
        """
        Compute the head, process, and tail lengths of the audio file,
        either by reading the explicit values
        from the Task configuration,
        or by using SD to detect them.

        Return the lengths, in seconds,
        as a ``(head, process, tail)`` triple.

        :rtype: tuple (float, float, float)
        """
        head_length = self.task.configuration["i_a_head"]
        process_length = self.task.configuration["i_a_process"]
        tail_length = self.task.configuration["i_a_tail"]
        head_max = self.task.configuration["i_a_head_max"]
        head_min = self.task.configuration["i_a_head_min"]
        tail_max = self.task.configuration["i_a_tail_max"]
        tail_min = self.task.configuration["i_a_tail_min"]
        if (
            (head_length is not None) or
            (process_length is not None) or
            (tail_length is not None)
        ):
            self.log(u"Setting explicit head process tail")
        else:
            self.log(u"Detecting head tail...")
            sd = SD(audio_file_mfcc, self.task.text_file, rconf=self.rconf, logger=self.logger)
            head_length = TimeValue("0.000")
            process_length = None
            tail_length = TimeValue("0.000")
            if (head_min is not None) or (head_max is not None):
                self.log(u"Detecting HEAD...")
                head_length = sd.detect_head(head_min, head_max)
                self.log([u"Detected HEAD: %.3f", head_length])
                self.log(u"Detecting HEAD... done")
            if (tail_min is not None) or (tail_max is not None):
                self.log(u"Detecting TAIL...")
                tail_length = sd.detect_tail(tail_min, tail_max)
                self.log([u"Detected TAIL: %.3f", tail_length])
                self.log(u"Detecting TAIL... done")
            self.log(u"Detecting head tail... done")
        self.log([u"Head:    %s", gf.safe_float(head_length, None)])
        self.log([u"Process: %s", gf.safe_float(process_length, None)])
        self.log([u"Tail:    %s", gf.safe_float(tail_length, None)])
        return (head_length, process_length, tail_length)

    def _set_synthesizer(self):
        """ Create synthesizer """
        self.log(u"Setting synthesizer...")
        self.synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
        self.log(u"Setting synthesizer... done")

    def _clear_cache_synthesizer(self):
        """ Clear the cache of the synthesizer """
        self.log(u"Clearing synthesizer...")
        self.synthesizer.clear_cache()
        self.log(u"Clearing synthesizer... done")

    def _synthesize(self, text_file):
        """
        Synthesize text into a WAVE file.

        Return a tuple consisting of:

        1. the handler of the generated audio file
        2. the path of the generated audio file
        3. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        4. a tuple describing the format of the audio file

        :param text_file: the text to be synthesized
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :rtype: tuple (handler, string, list, tuple)
        """
        handler, path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        result = self.synthesizer.synthesize(text_file, path)
        return (handler, path, result[0], self.synthesizer.output_audio_format)

    def _align_waves(self, real_wave_mfcc, synt_wave_mfcc, synt_anchors):
        """
        Align two AudioFileMFCC objects,
        representing WAVE files.

        Return a list of boundary indices.
        """
        self.log(u"Creating DTWAligner...")
        aligner = DTWAligner(
            real_wave_mfcc,
            synt_wave_mfcc,
            rconf=self.rconf,
            logger=self.logger
        )
        self.log(u"Creating DTWAligner... done")
        self.log(u"Computing boundary indices...")
        boundary_indices = aligner.compute_boundaries(synt_anchors)
        self.log(u"Computing boundary indices... done")
        return boundary_indices

    def _adjust_boundaries(self, boundary_indices, text_file, real_wave_mfcc, sync_root, force_aba_auto=False, leaf_level=False):
        """
        Adjust boundaries as requested by the user.

        The adjusted fragments are appended to ``sync_root``
        as a list of intervals ``[start_time, end_time]``,
        of length equal to the number of fragments plus 2,
        where the two extra elements are
        the HEAD (first) and the TAIL (last).
        """
        # boundary_indices contains the boundary indices in the all_mfcc of real_wave_mfcc,
        # starting with the boundary between the HEAD and the first fragment
        # and ending with the boundary between the last fragment and the TAIL
        aba_parameters = self.task.configuration.aba_parameters()
        if force_aba_auto:
            self.log(u"Forced running algorithm: 'auto'")
            aba_parameters["algorithm"] = (AdjustBoundaryAlgorithm.AUTO, [])
            # note that the other aba settings (nonspeech and nozero)
            # remain as specified by the user
        self.log([u"ABA parameters: %s", aba_parameters])
        aba = AdjustBoundaryAlgorithm(rconf=self.rconf, logger=self.logger)
        aba.adjust(
            aba_parameters=aba_parameters,
            real_wave_mfcc=real_wave_mfcc,
            boundary_indices=boundary_indices,
            text_file=text_file,
            allow_arbitrary_shift=leaf_level
        )
        aba.append_fragment_list_to_sync_root(sync_root=sync_root)

    def _append_trivial_tree(self, text_file, sync_root):
        """
        Append a trivial tree, made of one HEAD,
        one sync map fragment for each element of ``text_file``,
        and one TAIL.

        This function is called if either ``text_file`` has only one element,
        or if ``sync_root.value`` is an interval with zero length
        (i.e., ``sync_root.value.begin == sync_root.value.end``).
        """
        interval = sync_root.value
        #
        # NOTE the following is correct, but it is a bit obscure
        # time_values = [interval.begin] * (1 + len(text_file)) + [interval.end] * 2
        #
        if len(text_file) == 1:
            time_values = [interval.begin, interval.begin, interval.end, interval.end]
        else:
            # interval.begin == interval.end
            time_values = [interval.begin] * (3 + len(text_file))
        aba = AdjustBoundaryAlgorithm(rconf=self.rconf, logger=self.logger)
        aba.intervals_to_fragment_list(
            text_file=text_file,
            time_values=time_values
        )
        aba.append_fragment_list_to_sync_root(sync_root=sync_root)

    def _create_sync_map(self, sync_root):
        """
        If requested, check that the computed sync map is consistent.
        Then, add it to the Task.
        """
        sync_map = SyncMap(tree=sync_root, rconf=self.rconf, logger=self.logger)
        if self.rconf.safety_checks:
            self.log(u"Running sanity check on computed sync map...")
            if not sync_map.leaves_are_consistent:
                self._step_failure(ValueError(u"The computed sync map contains inconsistent fragments"))
            self.log(u"Running sanity check on computed sync map... passed")
        else:
            self.log(u"Not running sanity check on computed sync map")
        self.task.sync_map = sync_map
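
The time values built by ``_append_trivial_tree`` above are easier to see with concrete numbers. The following standalone sketch (with hypothetical begin/end values) mirrors the two branches of that method:

def trivial_time_values(begin, end, n_fragments):
    # One HEAD boundary, one boundary per fragment, one TAIL boundary.
    if n_fragments == 1:
        # a single fragment spanning the whole parent interval
        return [begin, begin, end, end]
    # otherwise the parent interval has zero length (begin == end)
    return [begin] * (3 + n_fragments)

print(trivial_time_values(1.5, 4.0, 1))   # [1.5, 1.5, 4.0, 4.0]
print(trivial_time_values(2.0, 2.0, 3))   # [2.0, 2.0, 2.0, 2.0, 2.0, 2.0]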
Exemple #17
0
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        :rtype: int
        """
        if len(self.actual_arguments) < 4:
            return self.print_help()
        text_format = gf.safe_unicode(self.actual_arguments[0])
        if text_format == u"list":
            text = gf.safe_unicode(self.actual_arguments[1])
        elif text_format in TextFileFormat.ALLOWED_VALUES:
            text = self.actual_arguments[1]
            if not self.check_input_file(text):
                return self.ERROR_EXIT_CODE
        else:
            return self.print_help()

        l1_id_regex = self.has_option_with_value(u"--l1-id-regex")
        l2_id_regex = self.has_option_with_value(u"--l2-id-regex")
        l3_id_regex = self.has_option_with_value(u"--l3-id-regex")
        id_regex = self.has_option_with_value(u"--id-regex")
        class_regex = self.has_option_with_value(u"--class-regex")
        sort = self.has_option_with_value(u"--sort")
        backwards = self.has_option([u"-b", u"--backwards"])
        quit_after = gf.safe_float(self.has_option_with_value(u"--quit-after"),
                                   None)
        start_fragment = gf.safe_int(self.has_option_with_value(u"--start"),
                                     None)
        end_fragment = gf.safe_int(self.has_option_with_value(u"--end"), None)
        parameters = {
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: l1_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: l2_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: l3_id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: class_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: sort,
        }
        if (text_format == TextFileFormat.MUNPARSED) and (
            (l1_id_regex is None) or (l2_id_regex is None) or
            (l3_id_regex is None)):
            self.print_error(
                u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format"
            )
            return self.ERROR_EXIT_CODE
        if (text_format == TextFileFormat.UNPARSED) and (
                id_regex is None) and (class_regex is None):
            self.print_error(
                u"You must specify --id-regex and/or --class-regex for unparsed format"
            )
            return self.ERROR_EXIT_CODE

        language = gf.safe_unicode(self.actual_arguments[2])

        output_file_path = self.actual_arguments[3]
        if not self.check_output_file(output_file_path):
            return self.ERROR_EXIT_CODE

        text_file = self.get_text_file(text_format, text, parameters)
        if text_file is None:
            self.print_error(
                u"Unable to build a TextFile from the given parameters")
            return self.ERROR_EXIT_CODE
        elif len(text_file) == 0:
            self.print_error(u"No text fragments found")
            return self.ERROR_EXIT_CODE
        text_file.set_language(language)
        self.print_info(u"Read input text with %d fragments" %
                        (len(text_file)))
        if start_fragment is not None:
            self.print_info(u"Slicing from index %d" % (start_fragment))
        if end_fragment is not None:
            self.print_info(u"Slicing to index %d" % (end_fragment))
        text_slice = text_file.get_slice(start_fragment, end_fragment)
        self.print_info(u"Synthesizing %d fragments" % (len(text_slice)))

        if quit_after is not None:
            self.print_info(u"Stop synthesizing upon reaching %.3f seconds" %
                            (quit_after))

        try:
            synt = Synthesizer(rconf=self.rconf, logger=self.logger)
            synt.synthesize(text_slice,
                            output_file_path,
                            quit_after=quit_after,
                            backwards=backwards)
            self.print_success(u"Created file '%s'" % output_file_path)
            synt.clear_cache()
            return self.NO_ERROR_EXIT_CODE
        except ImportError as exc:
            tts = self.rconf[RuntimeConfiguration.TTS]
            if tts == Synthesizer.AWS:
                self.print_error(
                    u"You need to install Python module boto3 to use the AWS Polly TTS API wrapper. Run:"
                )
                self.print_error(u"$ pip install boto3")
                self.print_error(u"or, to install for all users:")
                self.print_error(u"$ sudo pip install boto3")
            elif tts == Synthesizer.NUANCE:
                self.print_error(
                    u"You need to install Python module requests to use the Nuance TTS API wrapper. Run:"
                )
                self.print_error(u"$ pip install requests")
                self.print_error(u"or, to install for all users:")
                self.print_error(u"$ sudo pip install requests")
            else:
                self.print_error(
                    u"An unexpected error occurred while synthesizing text:")
                self.print_error(u"%s" % exc)
        except Exception as exc:
            self.print_error(
                u"An unexpected error occurred while synthesizing text:")
            self.print_error(u"%s" % exc)

        return self.ERROR_EXIT_CODE
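
Stripped of the option parsing, the synthesis path of the command above reduces to a handful of library calls. A minimal sketch, assuming a plain-format input text file and illustrative paths (the language code may need adjusting for your aeneas version):

from aeneas.synthesizer import Synthesizer
from aeneas.textfile import TextFile, TextFileFormat

text_file = TextFile("text.txt", TextFileFormat.PLAIN)   # hypothetical input path
text_file.set_language(u"en")                            # hypothetical language code
text_slice = text_file.get_slice(0, 5)                   # hypothetical slice indices
synt = Synthesizer()
synt.synthesize(text_slice, "out.wav", quit_after=30.0, backwards=False)
synt.clear_cache()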
Exemple #18
0
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        :rtype: int
        """
        if len(self.actual_arguments) < 4:
            return self.print_help()
        text_format = gf.safe_unicode(self.actual_arguments[0])
        if text_format == u"list":
            text = gf.safe_unicode(self.actual_arguments[1])
        elif text_format in TextFileFormat.ALLOWED_VALUES:
            text = self.actual_arguments[1]
            if not self.check_input_file(text):
                return self.ERROR_EXIT_CODE
        else:
            return self.print_help()

        l1_id_regex = self.has_option_with_value(u"--l1-id-regex")
        l2_id_regex = self.has_option_with_value(u"--l2-id-regex")
        l3_id_regex = self.has_option_with_value(u"--l3-id-regex")
        id_regex = self.has_option_with_value(u"--id-regex")
        class_regex = self.has_option_with_value(u"--class-regex")
        sort = self.has_option_with_value(u"--sort")
        backwards = self.has_option([u"-b", u"--backwards"])
        quit_after = gf.safe_float(self.has_option_with_value(u"--quit-after"), None)
        start_fragment = gf.safe_int(self.has_option_with_value(u"--start"), None)
        end_fragment = gf.safe_int(self.has_option_with_value(u"--end"), None)
        parameters = {
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: l1_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: l2_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: l3_id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: class_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: sort,
        }
        if (text_format == TextFileFormat.MUNPARSED) and ((l1_id_regex is None) or (l2_id_regex is None) or (l3_id_regex is None)):
            self.print_error(u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format")
            return self.ERROR_EXIT_CODE
        if (text_format == TextFileFormat.UNPARSED) and (id_regex is None) and (class_regex is None):
            self.print_error(u"You must specify --id-regex and/or --class-regex for unparsed format")
            return self.ERROR_EXIT_CODE

        language = gf.safe_unicode(self.actual_arguments[2])

        output_file_path = self.actual_arguments[3]
        if not self.check_output_file(output_file_path):
            return self.ERROR_EXIT_CODE

        text_file = self.get_text_file(text_format, text, parameters)
        if text_file is None:
            self.print_error(u"Unable to build a TextFile from the given parameters")
            return self.ERROR_EXIT_CODE
        elif len(text_file) == 0:
            self.print_error(u"No text fragments found")
            return self.ERROR_EXIT_CODE
        text_file.set_language(language)
        self.print_info(u"Read input text with %d fragments" % (len(text_file)))
        if start_fragment is not None:
            self.print_info(u"Slicing from index %d" % (start_fragment))
        if end_fragment is not None:
            self.print_info(u"Slicing to index %d" % (end_fragment))
        text_slice = text_file.get_slice(start_fragment, end_fragment)
        self.print_info(u"Synthesizing %d fragments" % (len(text_slice)))

        if quit_after is not None:
            self.print_info(u"Stop synthesizing upon reaching %.3f seconds" % (quit_after))

        try:
            synt = Synthesizer(rconf=self.rconf, logger=self.logger)
            synt.synthesize(
                text_slice,
                output_file_path,
                quit_after=quit_after,
                backwards=backwards
            )
            self.print_success(u"Created file '%s'" % output_file_path)
            synt.clear_cache()
            return self.NO_ERROR_EXIT_CODE
        except ImportError as exc:
            tts = self.rconf[RuntimeConfiguration.TTS]
            if tts == Synthesizer.AWS:
                self.print_error(u"You need to install Python module boto3 to use the AWS Polly TTS API wrapper. Run:")
                self.print_error(u"$ pip install boto3")
                self.print_error(u"or, to install for all users:")
                self.print_error(u"$ sudo pip install boto3")
            elif tts == Synthesizer.NUANCE:
                self.print_error(u"You need to install Python module requests to use the Nuance TTS API wrapper. Run:")
                self.print_error(u"$ pip install requests")
                self.print_error(u"or, to install for all users:")
                self.print_error(u"$ sudo pip install requests")
            else:
                self.print_error(u"An unexpected error occurred while synthesizing text:")
                self.print_error(u"%s" % exc)
        except Exception as exc:
            self.print_error(u"An unexpected error occurred while synthesizing text:")
            self.print_error(u"%s" % exc)

        return self.ERROR_EXIT_CODE
Exemple #19
0
    def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
        """ Detect start """

        self._log(["Min start length: %.3f", min_start_length])
        self._log(["Max start length: %.3f", max_start_length])
        self._log(["Metric:           %s", metric])
        self._log(["Backwards:        %s", str(backwards)])

        audio_rate = self.text_file.characters / self.audio_file.audio_length
        self._log(["Audio rate:     %.3f", audio_rate])

        self._log("Synthesizing query...")
        tmp_handler, tmp_file_path = tempfile.mkstemp(
            suffix=".wav",
            dir=gf.custom_tmp_dir()
        )
        synt = Synthesizer(logger=self.logger)
        synt_duration = max_start_length * self.QUERY_FACTOR
        self._log(["Synthesizing %.3f seconds", synt_duration])
        result = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=backwards
        )
        self._log("Synthesizing query... done")

        query_file = AudioFile(tmp_file_path)
        if backwards:
            self._log("Reversing query")
            query_file.reverse()
        self._log("Extracting MFCCs for query...")
        query_file.extract_mfcc(frame_rate=self.frame_rate)
        query_file.clear_data()
        self._log("Extracting MFCCs for query... done")

        self._log("Cleaning up...")
        self._cleanup(tmp_handler, tmp_file_path)
        self._log("Cleaning up... done")

        query_characters = result[2]
        query_len = query_file.audio_length
        query_mfcc = query_file.audio_mfcc
        query_rate = query_characters / query_len

        stretch_factor = max(1, query_rate / audio_rate)
        self._log(["Audio rate:     %.3f", audio_rate])
        self._log(["Query rate:     %.3f", query_rate])
        self._log(["Stretch factor: %.3f", stretch_factor])

        audio_mfcc = self.audio_file.audio_mfcc
        self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
        audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
        self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
        audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
        audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
        self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])

        l, o = audio_mfcc.shape
        l, n = query_mfcc.shape

        # minimum length of a matched interval in the real audio
        stretched_match_minimum_length = int(n * stretch_factor)

        self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
        self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
        self._log(["Stretch factor:          %.3f", stretch_factor])
        self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
        self._log("Speech intervals:")
        for interval in self.audio_speech:
            self._log(["  %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])

        admissible_intervals = [x for x in self.audio_speech if ((x[0] >= min_start_length) and (x[0] <= max_start_length))]
        self._log("AdmissibleSpeech intervals:")
        for interval in admissible_intervals:
            self._log(["  %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])

        candidates = []
        runs_with_min_length = 0
        runs_no_improvement = 0
        runs_min_distortion = numpy.inf
        runs_min_value = numpy.inf

        for interval in admissible_intervals:
            if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
                self._log("  Breaking: too many runs without improvement")
                break

            if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
                self._log("  Breaking: too many runs with minimum required length")
                break

            start_time = interval[0]
            start_index = self._t2i(start_time)
            self._log(["Evaluating interval starting at %d == %.3f ", start_index, start_time])
            if start_index > o:
                self._log("  Breaking: start index outside audio window")
                break

            req_end_index = start_index + stretched_match_minimum_length
            req_end_time = self._i2t(req_end_index)
            if req_end_index > o:
                self._log("  Breaking: not enough audio left in shifted window")
                break
            end_index = min(start_index + 2 * n, o)
            end_time = self._i2t(end_index)

            self._log(["  Start   %d == %.3f", start_index, start_time])
            self._log(["  Req end %d == %.3f", req_end_index, req_end_time])
            self._log(["  Eff end %d == %.3f", end_index, end_time])

            audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
            l, m = audio_mfcc_sub.shape

            self._log("Computing DTW...")
            aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
            aligner.real_wave_full_mfcc = audio_mfcc_sub
            aligner.synt_wave_full_mfcc = query_mfcc
            aligner.real_wave_length = self._i2t(m)
            aligner.synt_wave_length = self._i2t(n)
            acm = aligner.compute_accumulated_cost_matrix()
            # transpose, so we have an n x m accumulated cost matrix
            acm = acm.transpose()
            last_row = acm[-1, :]
            self._log("Computing DTW... done")

            # find the minimum, but its index must be >= stretched_match_minimum_length
            candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
            candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
            candidate_length_time = self._i2t(candidate_length_index)
            candidate_value = last_row[candidate_length_index]
            candidate_end_index = start_index + candidate_length_index
            candidate_end_time = self._i2t(candidate_end_index)
            candidate_distortion = candidate_value / candidate_length_index

            # check if the candidate has minimum length
            if candidate_length_index == stretched_match_minimum_length:
                runs_with_min_length += 1
            else:
                runs_with_min_length = 0

            # check if the candidate improved the global minimum value
            if metric == SDMetric.VALUE:
                if candidate_value < runs_min_value:
                    runs_min_value = candidate_value
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1
            if metric == SDMetric.DISTORTION:
                if candidate_distortion < runs_min_distortion:
                    runs_min_distortion = candidate_distortion
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1

            # append to the list of candidates
            self._log(["    Interval  start:      %d == %.6f", start_index, start_time])
            self._log(["    Interval  end:        %d == %.6f", end_index, end_time])
            self._log(["    Candidate start:      %d == %.6f", start_index, start_time])
            self._log(["    Candidate end:        %d == %.6f", candidate_end_index, candidate_end_time])
            self._log(["    Candidate length:     %d == %.6f", candidate_length_index, candidate_length_time])
            self._log(["    Candidate value:      %.6f", candidate_value])
            self._log(["    Candidate distortion: %.6f", candidate_distortion])
            candidates.append({
                "start_index": start_index,
                "length": candidate_length_index,
                "value": candidate_value,
                "distortion": candidate_distortion
            })

        # select best candidate and return its start time
        # if we have no best candidate, return 0.0
        best_candidate = self._select_best_candidate(candidates, metric)
        if best_candidate is None:
            return 0.0
        sd_time = self._i2t(max(best_candidate["start_index"], 0))
        self._log(["Returning time %.3f", sd_time])
        return sd_time
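
The helper ``_select_best_candidate`` used at the end of ``_detect_start`` is not shown on this page. A plausible minimal sketch, assuming it simply picks the candidate minimizing the chosen metric (here passed as the dictionary key rather than an SDMetric constant):

def select_best_candidate(candidates, metric_key):
    # candidates are dicts with "start_index", "length", "value", "distortion";
    # return the one minimizing the requested metric, or None if the list is empty
    if len(candidates) == 0:
        return None
    return min(candidates, key=lambda candidate: candidate[metric_key])

best = select_best_candidate(
    [{"start_index": 10, "length": 50, "value": 9.8, "distortion": 0.196},
     {"start_index": 40, "length": 60, "value": 11.1, "distortion": 0.185}],
    "distortion"
)
print(best["start_index"])   # 40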
Exemple #20
0
def main():
    """ Entry point """
    if len(sys.argv) < 5:
        usage()
        return
    language = sys.argv[1]
    text_file_path = sys.argv[2]
    text_format = sys.argv[3]
    audio_file_path = sys.argv[-1]
    backwards = False
    quit_after = None
    parameters = {}

    for i in range(4, len(sys.argv) - 1):
        args = sys.argv[i].split("=")
        if len(args) == 1:
            backwards = (args[0] in ["b", "-b", "backwards", "--backwards"])
        if len(args) == 2:
            key, value = args
            if key == "id_regex":
                parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX] = value
            if key == "class_regex":
                parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX] = value
            if key == "sort":
                parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT] = value
            if (key == "start") or (key == "end"):
                try:
                    parameters[key] = int(value)
                except ValueError:
                    pass
            if key == "quit_after":
                quit_after = float(value)

    if text_format == "list":
        text_file = TextFile()
        text_file.read_from_list(text_file_path.split("|"))
    else:
        text_file = TextFile(text_file_path, text_format, parameters)
    text_file.set_language(language)

    start_fragment = None
    if "start" in parameters:
        start_fragment = parameters["start"]

    end_fragment = None
    if "end" in parameters:
        end_fragment = parameters["end"]

    print "[INFO] Read input text file with %d fragments" % (len(text_file))
    if start_fragment is not None:
        print "[INFO] Slicing from index %d" % (start_fragment)
    if end_fragment is not None:
        print "[INFO] Slicing to index %d" % (end_fragment)
    text_slice = text_file.get_slice(start_fragment, end_fragment)
    print "[INFO] Synthesizing %d fragments" % (len(text_slice))
    if quit_after is not None:
        print "[INFO] Stop synthesizing after reaching %.3f seconds" % (
            quit_after)
    if backwards:
        print "[INFO] Synthesizing backwards"
    synt = Synthesizer()
    synt.synthesize(text_slice, audio_file_path, quit_after, backwards)
    print "[INFO] Created file '%s'" % audio_file_path
Exemple #21
0
    def test_synthesize_path_not_writeable(self):
        tfl = TextFile()
        synth = Synthesizer()
        with self.assertRaises(OSError):
            synth.synthesize(tfl, self.PATH_NOT_WRITEABLE)
Exemple #22
0
    def test_synthesize_invalid_text_file(self):
        synth = Synthesizer()
        with self.assertRaises(TypeError):
            synth.synthesize("foo", self.PATH_NOT_WRITEABLE)
Exemple #24
0
    def _detect(self, min_length, max_length, tail=False):
        """
        Detect the head or tail, whose duration lies between ``min_length`` and ``max_length``.

        If detecting the tail, the real wave MFCC and the query are reversed
        so that the tail detection problem reduces to a head detection problem.

        Return the duration of the head or tail, in seconds.

        :param min_length: estimated minimum length
        :type  min_length: :class:`~aeneas.exacttiming.TimeValue`
        :param max_length: estimated maximum length
        :type  max_length: :class:`~aeneas.exacttiming.TimeValue`
        :rtype: :class:`~aeneas.exacttiming.TimeValue`
        :raises: TypeError: if one of the parameters is not ``None`` or a number
        :raises: ValueError: if one of the parameters is negative
        """
        def _sanitize(value, default, name):
            if value is None:
                value = default
            try:
                value = TimeValue(value)
            except (TypeError, ValueError, InvalidOperation) as exc:
                self.log_exc(u"The value of %s is not a number" % (name), exc, True, TypeError)
            if value < 0:
                self.log_exc(u"The value of %s is negative" % (name), None, True, ValueError)
            return value

        min_length = _sanitize(min_length, self.MIN_LENGTH, "min_length")
        max_length = _sanitize(max_length, self.MAX_LENGTH, "max_length")
        mws = self.rconf.mws
        min_length_frames = int(min_length / mws)
        max_length_frames = int(max_length / mws)
        self.log([u"MFCC window shift s:     %.3f", mws])
        self.log([u"Min start length s:      %.3f", min_length])
        self.log([u"Min start length frames: %d", min_length_frames])
        self.log([u"Max start length s:      %.3f", max_length])
        self.log([u"Max start length frames: %d", max_length_frames])
        self.log([u"Tail?:                   %s", str(tail)])

        self.log(u"Synthesizing query...")
        synt_duration = max_length * self.QUERY_FACTOR
        self.log([u"Synthesizing at least %.3f seconds", synt_duration])
        tmp_handler, tmp_file_path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        synt = Synthesizer(rconf=self.rconf, logger=self.logger)
        anchors, total_time, synthesized_chars = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=tail
        )
        self.log(u"Synthesizing query... done")

        self.log(u"Extracting MFCCs for query...")
        query_mfcc = AudioFileMFCC(tmp_file_path, rconf=self.rconf, logger=self.logger)
        self.log(u"Extracting MFCCs for query... done")

        self.log(u"Cleaning up...")
        gf.delete_file(tmp_handler, tmp_file_path)
        self.log(u"Cleaning up... done")

        search_window = max_length * self.AUDIO_FACTOR
        search_window_end = min(int(search_window / mws), self.real_wave_mfcc.all_length)
        self.log([u"Query MFCC length (frames): %d", query_mfcc.all_length])
        self.log([u"Real MFCC length (frames):  %d", self.real_wave_mfcc.all_length])
        self.log([u"Search window end (s):      %.3f", search_window])
        self.log([u"Search window end (frames): %d", search_window_end])

        if tail:
            self.log(u"Tail => reversing real_wave_mfcc and query_mfcc")
            self.real_wave_mfcc.reverse()
            query_mfcc.reverse()

        # NOTE: VAD will be run here, if not done before
        speech_intervals = self.real_wave_mfcc.intervals(speech=True, time=False)
        if len(speech_intervals) < 1:
            self.log(u"No speech intervals, hence no start found")
            if tail:
                self.real_wave_mfcc.reverse()
            return TimeValue("0.000")

        # generate a list of begin indices
        search_end = None
        candidates_begin = []
        for interval in speech_intervals:
            if (interval[0] >= min_length_frames) and (interval[0] <= max_length_frames):
                candidates_begin.append(interval[0])
            search_end = interval[1]
            if search_end >= search_window_end:
                break

        # for each begin index, compute the acm cost
        # to match the query
        # note that we take the min over the last column of the acm
        # meaning that we allow to match the entire query wave
        # against a portion of the real wave
        candidates = []
        for candidate_begin in candidates_begin:
            self.log([u"Candidate interval starting at %d == %.3f", candidate_begin, candidate_begin * mws])
            try:
                rwm = AudioFileMFCC(
                    mfcc_matrix=self.real_wave_mfcc.all_mfcc[:, candidate_begin:search_end],
                    rconf=self.rconf,
                    logger=self.logger
                )
                dtw = DTWAligner(
                    real_wave_mfcc=rwm,
                    synt_wave_mfcc=query_mfcc,
                    rconf=self.rconf,
                    logger=self.logger
                )
                acm = dtw.compute_accumulated_cost_matrix()
                last_column = acm[:, -1]
                min_value = numpy.min(last_column)
                min_index = numpy.argmin(last_column)
                self.log([u"Candidate interval: %d %d == %.3f %.3f", candidate_begin, search_end, candidate_begin * mws, search_end * mws])
                self.log([u"  Min value: %.6f", min_value])
                self.log([u"  Min index: %d == %.3f", min_index, min_index * mws])
                candidates.append((min_value, candidate_begin, min_index))
            except Exception as exc:
                self.log_exc(u"An unexpected error occurred while running _detect", exc, False, None)

        # reverse again the real wave
        if tail:
            self.log(u"Tail => reversing real_wave_mfcc again")
            self.real_wave_mfcc.reverse()

        # return
        if len(candidates) < 1:
            self.log(u"No candidates found")
            return TimeValue("0.000")
        self.log(u"Candidates:")
        for candidate in candidates:
            self.log([u"  Value: %.6f Begin Time: %.3f Min Index: %d", candidate[0], candidate[1] * mws, candidate[2]])
        best = sorted(candidates)[0][1]
        self.log([u"Best candidate: %d == %.3f", best, best * mws])
        return best * mws
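
The final ranking above relies on tuple ordering: each candidate is a ``(min_value, candidate_begin, min_index)`` tuple, so sorting and taking the first element yields the begin frame of the lowest-cost match, which is then converted to seconds via the MFCC window shift. A small worked example with hypothetical numbers:

candidates = [(12.5, 40, 80), (9.8, 55, 70), (11.1, 10, 90)]   # (min_value, begin_frame, min_index)
mws = 0.040                                                    # hypothetical MFCC window shift, in seconds
best = sorted(candidates)[0][1]
print("%d %.3f" % (best, best * mws))   # 55 2.200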
Exemple #26
0
    def _set_synthesizer(self):
        """ Create synthesizer """
        self.log(u"Setting synthesizer...")
        self.synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
        self.log(u"Setting synthesizer... done")
Exemple #27
0
    def _detect_start(self,
                      min_start_length,
                      max_start_length,
                      metric,
                      backwards=False):
        """ Detect start """

        self._log(["Min start length: %.3f", min_start_length])
        self._log(["Max start length: %.3f", max_start_length])
        self._log(["Metric:           %s", metric])
        self._log(["Backwards:        %s", str(backwards)])

        audio_rate = self.text_file.characters / self.audio_file.audio_length
        self._log(["Audio rate:     %.3f", audio_rate])

        self._log("Synthesizing query...")
        tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav",
                                                      dir=gf.custom_tmp_dir())
        synt = Synthesizer(logger=self.logger)
        synt_duration = max_start_length * self.QUERY_FACTOR
        self._log(["Synthesizing %.3f seconds", synt_duration])
        result = synt.synthesize(self.text_file,
                                 tmp_file_path,
                                 quit_after=synt_duration,
                                 backwards=backwards)
        self._log("Synthesizing query... done")

        query_file = AudioFile(tmp_file_path)
        if backwards:
            self._log("Reversing query")
            query_file.reverse()
        self._log("Extracting MFCCs for query...")
        query_file.extract_mfcc(frame_rate=self.frame_rate)
        query_file.clear_data()
        self._log("Extracting MFCCs for query... done")

        self._log("Cleaning up...")
        self._cleanup(tmp_handler, tmp_file_path)
        self._log("Cleaning up... done")

        query_characters = result[2]
        query_len = query_file.audio_length
        query_mfcc = query_file.audio_mfcc
        query_rate = query_characters / query_len

        stretch_factor = max(1, query_rate / audio_rate)
        self._log(["Audio rate:     %.3f", audio_rate])
        self._log(["Query rate:     %.3f", query_rate])
        self._log(["Stretch factor: %.3f", stretch_factor])

        audio_mfcc = self.audio_file.audio_mfcc
        self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
        audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR *
                                   self.frame_rate)
        self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
        audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
        audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
        self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])

        l, o = audio_mfcc.shape
        l, n = query_mfcc.shape

        # minimum length of a matched interval in the real audio
        stretched_match_minimum_length = int(n * stretch_factor)

        self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
        self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
        self._log(["Stretch factor:          %.3f", stretch_factor])
        self._log(
            ["Required minimum length: %.3f", stretched_match_minimum_length])
        self._log("Speech intervals:")
        for interval in self.audio_speech:
            self._log([
                "  %d %d == %.3f %.3f",
                self._t2i(interval[0]),
                self._t2i(interval[1]), interval[0], interval[1]
            ])

        admissible_intervals = [
            x for x in self.audio_speech
            if ((x[0] >= min_start_length) and (x[0] <= max_start_length))
        ]
        self._log("AdmissibleSpeech intervals:")
        for interval in admissible_intervals:
            self._log([
                "  %d %d == %.3f %.3f",
                self._t2i(interval[0]),
                self._t2i(interval[1]), interval[0], interval[1]
            ])

        candidates = []
        runs_with_min_length = 0
        runs_no_improvement = 0
        runs_min_distortion = numpy.inf
        runs_min_value = numpy.inf

        for interval in admissible_intervals:
            if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
                self._log("  Breaking: too many runs without improvement")
                break

            if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
                self._log(
                    "  Breaking: too many runs with minimum required length")
                break

            start_time = interval[0]
            start_index = self._t2i(start_time)
            self._log([
                "Evaluating interval starting at %d == %.3f ", start_index,
                start_time
            ])
            if start_index > o:
                self._log("  Breaking: start index outside audio window")
                break

            req_end_index = start_index + stretched_match_minimum_length
            req_end_time = self._i2t(req_end_index)
            if req_end_index > o:
                self._log(
                    "  Breaking: not enough audio left in shifted window")
                break
            end_index = min(start_index + 2 * n, o)
            end_time = self._i2t(end_index)

            self._log(["  Start   %d == %.3f", start_index, start_time])
            self._log(["  Req end %d == %.3f", req_end_index, req_end_time])
            self._log(["  Eff end %d == %.3f", end_index, end_time])

            audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
            l, m = audio_mfcc_sub.shape

            self._log("Computing DTW...")
            aligner = DTWAligner(None,
                                 None,
                                 frame_rate=self.frame_rate,
                                 logger=self.logger)
            aligner.real_wave_full_mfcc = audio_mfcc_sub
            aligner.synt_wave_full_mfcc = query_mfcc
            aligner.real_wave_length = self._i2t(m)
            aligner.synt_wave_length = self._i2t(n)
            acm = aligner.compute_accumulated_cost_matrix()
            # transpose, so we have an n x m accumulated cost matrix
            acm = acm.transpose()
            last_row = acm[-1, :]
            self._log("Computing DTW... done")

            # find the minimum, but its index must be >= stretched_match_minimum_length
            candidate_argmin_index = numpy.argmin(
                last_row[stretched_match_minimum_length:])
            candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
            candidate_length_time = self._i2t(candidate_length_index)
            candidate_value = last_row[candidate_length_index]
            candidate_end_index = start_index + candidate_length_index
            candidate_end_time = self._i2t(candidate_end_index)
            candidate_distortion = candidate_value / candidate_length_index

            # check if the candidate has minimum length
            if candidate_length_index == stretched_match_minimum_length:
                runs_with_min_length += 1
            else:
                runs_with_min_length = 0

            # check if the candidate improved the global minimum value
            if metric == SDMetric.VALUE:
                if candidate_value < runs_min_value:
                    runs_min_value = candidate_value
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1
            if metric == SDMetric.DISTORTION:
                if candidate_distortion < runs_min_distortion:
                    runs_min_distortion = candidate_distortion
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1

            # append to the list of candidates
            self._log([
                "    Interval  start:      %d == %.6f", start_index, start_time
            ])
            self._log(
                ["    Interval  end:        %d == %.6f", end_index, end_time])
            self._log([
                "    Candidate start:      %d == %.6f", start_index, start_time
            ])
            self._log([
                "    Candidate end:        %d == %.6f", candidate_end_index,
                candidate_end_time
            ])
            self._log([
                "    Candidate length:     %d == %.6f", candidate_length_index,
                candidate_length_time
            ])
            self._log(["    Candidate value:      %.6f", candidate_value])
            self._log(["    Candidate distortion: %.6f", candidate_distortion])
            candidates.append({
                "start_index": start_index,
                "length": candidate_length_index,
                "value": candidate_value,
                "distortion": candidate_distortion
            })

        # select best candidate and return its start time
        # if we have no best candidate, return 0.0
        best_candidate = self._select_best_candidate(candidates, metric)
        if best_candidate is None:
            return 0.0
        sd_time = self._i2t(max(best_candidate["start_index"], 0))
        self._log(["Returning time %.3f", sd_time])
        return sd_time
Exemple #28
0
    def test_clear_cache(self):
        synth = Synthesizer()
        synth.clear_cache()
Exemple #29
0
    def _detect(self, min_length, max_length, tail=False):
        """
        Detect the head or tail, whose duration lies between ``min_length`` and ``max_length``.

        If detecting the tail, the real wave MFCC and the query are reversed
        so that the tail detection problem reduces to a head detection problem.

        Return the duration of the head or tail, in seconds.

        :param min_length: estimated minimum length
        :type  min_length: :class:`~aeneas.timevalue.TimeValue`
        :param max_length: estimated maximum length
        :type  max_length: :class:`~aeneas.timevalue.TimeValue`
        :rtype: :class:`~aeneas.timevalue.TimeValue`
        :raises: TypeError: if one of the parameters is not ``None`` or a number
        :raises: ValueError: if one of the parameters is negative
        """
        def _sanitize(value, default, name):
            if value is None:
                value = default
            try:
                value = TimeValue(value)
            except (TypeError, ValueError, InvalidOperation) as exc:
                self.log_exc(u"The value of %s is not a number" % (name), exc, True, TypeError)
            if value < 0:
                self.log_exc(u"The value of %s is negative" % (name), None, True, ValueError)
            return value

        min_length = _sanitize(min_length, self.MIN_LENGTH, "min_length")
        max_length = _sanitize(max_length, self.MAX_LENGTH, "max_length")
        mws = self.rconf.mws
        min_length_frames = int(min_length / mws)
        max_length_frames = int(max_length / mws)
        self.log([u"MFCC window shift s:     %.3f", mws])
        self.log([u"Min start length s:      %.3f", min_length])
        self.log([u"Min start length frames: %d", min_length_frames])
        self.log([u"Max start length s:      %.3f", max_length])
        self.log([u"Max start length frames: %d", max_length_frames])
        self.log([u"Tail?:                   %s", str(tail)])

        self.log(u"Synthesizing query...")
        synt_duration = max_length * self.QUERY_FACTOR
        self.log([u"Synthesizing at least %.3f seconds", synt_duration])
        tmp_handler, tmp_file_path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        synt = Synthesizer(rconf=self.rconf, logger=self.logger)
        anchors, total_time, synthesized_chars = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=tail
        )
        self.log(u"Synthesizing query... done")

        self.log(u"Extracting MFCCs for query...")
        query_mfcc = AudioFileMFCC(tmp_file_path, rconf=self.rconf, logger=self.logger)
        self.log(u"Extracting MFCCs for query... done")

        self.log(u"Cleaning up...")
        gf.delete_file(tmp_handler, tmp_file_path)
        self.log(u"Cleaning up... done")

        search_window = max_length * self.AUDIO_FACTOR
        search_window_end = min(int(search_window / mws), self.real_wave_mfcc.all_length)
        self.log([u"Query MFCC length (frames): %d", query_mfcc.all_length])
        self.log([u"Real MFCC length (frames):  %d", self.real_wave_mfcc.all_length])
        self.log([u"Search window end (s):      %.3f", search_window])
        self.log([u"Search window end (frames): %d", search_window_end])

        if tail:
            self.log(u"Tail => reversing real_wave_mfcc and query_mfcc")
            self.real_wave_mfcc.reverse()
            query_mfcc.reverse()

        # NOTE: VAD will be run here, if not done before
        speech_intervals = self.real_wave_mfcc.intervals(speech=True, time=False)
        if len(speech_intervals) < 1:
            self.log(u"No speech intervals, hence no start found")
            if tail:
                self.real_wave_mfcc.reverse()
            return TimeValue("0.000")

        # generate a list of begin indices
        search_end = None
        candidates_begin = []
        for interval in speech_intervals:
            if (interval[0] >= min_length_frames) and (interval[0] <= max_length_frames):
                candidates_begin.append(interval[0])
            search_end = interval[1]
            if search_end >= search_window_end:
                break

        # for each candidate begin index, compute the accumulated cost
        # of matching the query starting there;
        # note that we take the min over the last column of the
        # accumulated cost matrix, which allows the entire query wave
        # to be matched against only a portion of the real wave
        candidates = []
        for candidate_begin in candidates_begin:
            self.log([u"Candidate interval starting at %d == %.3f", candidate_begin, candidate_begin * mws])
            try:
                rwm = AudioFileMFCC(
                    mfcc_matrix=self.real_wave_mfcc.all_mfcc[:, candidate_begin:search_end],
                    rconf=self.rconf,
                    logger=self.logger
                )
                dtw = DTWAligner(
                    real_wave_mfcc=rwm,
                    synt_wave_mfcc=query_mfcc,
                    rconf=self.rconf,
                    logger=self.logger
                )
                acm = dtw.compute_accumulated_cost_matrix()
                last_column = acm[:, -1]
                min_value = numpy.min(last_column)
                min_index = numpy.argmin(last_column)
                self.log([u"Candidate interval: %d %d == %.3f %.3f", candidate_begin, search_end, candidate_begin * mws, search_end * mws])
                self.log([u"  Min value: %.6f", min_value])
                self.log([u"  Min index: %d == %.3f", min_index, min_index * mws])
                candidates.append((min_value, candidate_begin, min_index))
            except Exception as exc:
                self.log_exc(u"An unexpected error occurred while running _detect", exc, False, None)

        # reverse again the real wave
        if tail:
            self.log(u"Tail => reversing real_wave_mfcc again")
            self.real_wave_mfcc.reverse()

        # return
        if len(candidates) < 1:
            self.log(u"No candidates found")
            return TimeValue("0.000")
        self.log(u"Candidates:")
        for candidate in candidates:
            self.log([u"  Value: %.6f Begin Time: %.3f Min Index: %d", candidate[0], candidate[1] * mws, candidate[2]])
        best = sorted(candidates)[0][1]
        self.log([u"Best candidate: %d == %.3f", best, best * mws])
        return best * mws
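
The candidate selection above reduces to: for each candidate begin frame, compute the DTW accumulated cost matrix between the synthesized query and the real wave starting at that frame, keep the minimum of the last column, and pick the candidate with the smallest such minimum. The following is a minimal, self-contained sketch of that selection step only; the window shift value (0.040 s) and the random matrices are placeholders standing in for real accumulated cost matrices.

import numpy

mws = 0.040  # hypothetical MFCC window shift, in seconds
candidates = []
# each pair is (candidate_begin_frame, accumulated_cost_matrix);
# random matrices are used here purely as stand-ins
for candidate_begin, acm in [(10, numpy.random.rand(50, 80)),
                             (25, numpy.random.rand(50, 65))]:
    last_column = acm[:, -1]
    candidates.append((numpy.min(last_column), candidate_begin, numpy.argmin(last_column)))
best_begin = sorted(candidates)[0][1]
print("estimated head length: %.3f s" % (best_begin * mws))
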
Exemple #30
0
class ExecuteTask(Loggable):
    """
    Execute a task, that is, compute the sync map for it.

    :param task: the task to be executed
    :type  task: :class:`~aeneas.task.Task`
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    """

    TAG = u"ExecuteTask"

    def __init__(self, task=None, rconf=None, logger=None):
        super(ExecuteTask, self).__init__(rconf=rconf, logger=logger)
        self.task = task
        self.step_index = 1
        self.step_label = u""
        self.step_begin_time = None
        self.step_total = 0.000
        self.synthesizer = None
        if task is not None:
            self.load_task(self.task)

    def load_task(self, task):
        """
        Load the task from the given ``Task`` object.

        :param task: the task to load
        :type  task: :class:`~aeneas.task.Task`
        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`: if ``task`` is not an instance of :class:`~aeneas.task.Task`
        """
        if not isinstance(task, Task):
            self.log_exc(u"task is not an instance of Task", None, True,
                         ExecuteTaskInputError)
        self.task = task

    def _step_begin(self, label, log=True):
        """ Log begin of a step """
        if log:
            self.step_label = label
            self.step_begin_time = self.log(u"STEP %d BEGIN (%s)" %
                                            (self.step_index, label))

    def _step_end(self, log=True):
        """ Log end of a step """
        if log:
            step_end_time = self.log(u"STEP %d END (%s)" %
                                     (self.step_index, self.step_label))
            diff = (step_end_time - self.step_begin_time)
            diff = float(diff.seconds + diff.microseconds / 1000000.0)
            self.step_total += diff
            self.log(u"STEP %d DURATION %.3f (%s)" %
                     (self.step_index, diff, self.step_label))
            self.step_index += 1

    def _step_failure(self, exc):
        """ Log failure of a step """
        self.log_crit(u"STEP %d (%s) FAILURE" %
                      (self.step_index, self.step_label))
        self.step_index += 1
        self.log_exc(u"Unexpected error while executing task", exc, True,
                     ExecuteTaskExecutionError)

    def _step_total(self):
        """ Log total """
        self.log(u"STEP T DURATION %.3f" % (self.step_total))

    def execute(self):
        """
        Execute the task.
        The sync map produced will be stored inside the task object.

        :raises: :class:`~aeneas.executetask.ExecuteTaskInputError`: if there is a problem with the input parameters
        :raises: :class:`~aeneas.executetask.ExecuteTaskExecutionError`: if there is a problem during the task execution
        """
        self.log(u"Executing task...")

        # check that we have the AudioFile object
        if self.task.audio_file is None:
            self.log_exc(u"The task does not seem to have its audio file set",
                         None, True, ExecuteTaskInputError)
        if ((self.task.audio_file.audio_length is None)
                or (self.task.audio_file.audio_length <= 0)):
            self.log_exc(u"The task seems to have an invalid audio file", None,
                         True, ExecuteTaskInputError)
        task_max_audio_length = self.rconf[
            RuntimeConfiguration.TASK_MAX_AUDIO_LENGTH]
        if ((task_max_audio_length > 0) and
            (self.task.audio_file.audio_length > task_max_audio_length)):
            self.log_exc(
                u"The audio file of the task has length %.3f, more than the maximum allowed (%.3f)."
                % (self.task.audio_file.audio_length, task_max_audio_length),
                None, True, ExecuteTaskInputError)

        # check that we have the TextFile object
        if self.task.text_file is None:
            self.log_exc(u"The task does not seem to have its text file set",
                         None, True, ExecuteTaskInputError)
        if len(self.task.text_file) == 0:
            self.log_exc(u"The task text file seems to have no text fragments",
                         None, True, ExecuteTaskInputError)
        task_max_text_length = self.rconf[
            RuntimeConfiguration.TASK_MAX_TEXT_LENGTH]
        if ((task_max_text_length > 0)
                and (len(self.task.text_file) > task_max_text_length)):
            self.log_exc(
                u"The text file of the task has %d fragments, more than the maximum allowed (%d)."
                % (len(self.task.text_file), task_max_text_length), None, True,
                ExecuteTaskInputError)
        if self.task.text_file.chars == 0:
            self.log_exc(u"The task text file seems to have empty text", None,
                         True, ExecuteTaskInputError)

        self.log(u"Both audio and text input files are present")

        # execute
        self.step_index = 1
        self.step_total = 0.000
        if self.task.text_file.file_format in TextFileFormat.MULTILEVEL_VALUES:
            self._execute_multi_level_task()
        else:
            self._execute_single_level_task()
        self.log(u"Executing task... done")

    def _execute_single_level_task(self):
        """ Execute a single-level task """
        self.log(u"Executing single level task...")
        try:
            # load audio file, extract MFCCs from real wave, clear audio file
            self._step_begin(u"extract MFCC real wave")
            real_wave_mfcc = self._extract_mfcc(
                file_path=self.task.audio_file_path_absolute, file_format=None)
            self._step_end()

            # compute head and/or tail and set it
            self._step_begin(u"compute head tail")
            (head_length, process_length,
             tail_length) = self._compute_head_process_tail(real_wave_mfcc)
            real_wave_mfcc.set_head_middle_tail(head_length, process_length,
                                                tail_length)
            self._step_end()

            # compute a time map alignment
            self._set_synthesizer()
            time_map = self._execute_inner(real_wave_mfcc,
                                           self.task.text_file,
                                           adjust_boundaries=True,
                                           log=True)
            self._clear_cache_synthesizer()

            # convert time_map to tree and create syncmap and add it to task
            self._step_begin(u"create sync map")
            tree = self._level_time_map_to_tree(self.task.text_file, time_map)
            self.task.sync_map = self._create_syncmap(tree)
            self._step_end()

            # check for fragments with zero duration
            self._step_begin(u"check zero duration")
            self._check_no_zero(self.rconf.mws)
            self._step_end()

            # log total
            self._step_total()
            self.log(u"Executing single level task... done")
        except Exception as exc:
            self._step_failure(exc)

    def _execute_multi_level_task(self):
        """ Execute a multi-level task """
        self.log(u"Executing multi level task...")

        self.log(u"Saving rconf...")
        # save original rconf
        orig_rconf = self.rconf.clone()
        # clone rconfs and set granularity
        level_rconfs = [
            None,
            self.rconf.clone(),
            self.rconf.clone(),
            self.rconf.clone()
        ]
        level_mfccs = [None, None, None, None]
        for i in range(1, len(level_rconfs)):
            level_rconfs[i].set_granularity(i)
            self.log([u"Level %d mws: %.3f", i, level_rconfs[i].mws])
            self.log([u"Level %d mwl: %.3f", i, level_rconfs[i].mwl])
            level_rconfs[i].set_tts(i)
            self.log([u"Level %d tts: %s", i, level_rconfs[i].tts])
            self.log([u"Level %d tts_path: %s", i, level_rconfs[i].tts_path])
        self.log(u"Saving rconf... done")

        try:
            self.log(u"Creating AudioFile object...")
            audio_file = self._load_audio_file()
            self.log(u"Creating AudioFile object... done")

            # extract MFCC for each level
            for i in range(1, len(level_rconfs)):
                self._step_begin(u"extract MFCC real wave level %d" % i)
                if (i == 1) or (
                        level_rconfs[i].mws != level_rconfs[i - 1].mws) or (
                            level_rconfs[i].mwl != level_rconfs[i - 1].mwl):
                    self.rconf = level_rconfs[i]
                    level_mfccs[i] = self._extract_mfcc(audio_file=audio_file)
                else:
                    self.log(u"Keeping MFCC real wave from previous level")
                    level_mfccs[i] = level_mfccs[i - 1]
                self._step_end()

            self.log(u"Clearing AudioFile object...")
            self.rconf = level_rconfs[1]
            self._clear_audio_file(audio_file)
            self.log(u"Clearing AudioFile object... done")

            # compute head tail for the entire real wave (level 1)
            self._step_begin(u"compute head tail")
            (head_length, process_length,
             tail_length) = self._compute_head_process_tail(level_mfccs[1])
            level_mfccs[1].set_head_middle_tail(head_length, process_length,
                                                tail_length)
            self._step_end()

            # compute alignment at each level
            tree = Tree()
            sync_roots = [tree]
            text_files = [self.task.text_file]
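            # aht[i]: whether to add HEAD/TAIL nodes to the sync map tree at level i
            # aba[i]: whether to run the adjust boundary algorithm at level i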
            aht = [None, True, False, False]
            aba = [None, True, True, False]
            for i in range(1, len(level_rconfs)):
                self._step_begin(u"compute alignment level %d" % i)
                self.rconf = level_rconfs[i]
                text_files, sync_roots = self._execute_level(
                    i, level_mfccs[i], text_files, sync_roots, aht[i], aba[i])
                self._step_end()

            self._step_begin(u"select levels")
            tree = self._select_levels(tree)
            self._step_end()

            self._step_begin(u"create sync map")
            self.rconf = orig_rconf
            self.task.sync_map = self._create_syncmap(tree)
            self._step_end()

            self._step_begin(u"check zero duration")
            self._check_no_zero(level_rconfs[-1].mws)
            self._step_end()

            self._step_total()
            self.log(u"Executing multi level task... done")
        except Exception as exc:
            self._step_failure(exc)

    def _execute_level(self, level, audio_file_mfcc, text_files, sync_roots,
                       add_head_tail, adjust_boundaries):
        """
        Compute the alignment for all the nodes in the given level.

        Return a pair ``(next_level_text_files, next_level_sync_roots)``,
        that is, the list of text file subtrees and the list of
        sync map subtrees for the next level.

        :param int level: the level
        :param audio_file_mfcc: the audio MFCC representation for this level
        :type  audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param list text_files: a list of :class:`~aeneas.textfile.TextFile` objects,
                                each representing a (sub)tree of the Task text file
        :param list sync_roots: a list of :class:`~aeneas.tree.Tree` objects,
                                each representing a SyncMapFragment tree,
                                one for each element in ``text_files``
        :param bool add_head_tail: if ``True``, add head and tail nodes to the sync map tree
        :param bool adjust_boundaries: if ``True``, execute the adjust boundary algorithm
        :rtype: (list, list)
        """
        self._set_synthesizer()
        next_level_text_files = []
        next_level_sync_roots = []
        for text_file_index, text_file in enumerate(text_files):
            self.log([u"Text level %d, fragment %d", level, text_file_index])
            self.log([u"  Len:   %d", len(text_file)])
            sync_root = sync_roots[text_file_index]
            if (level > 1) and (len(text_file) == 1):
                self.log(u"  Level > 1 and only one child => returning trivial timemap")
                time_map = [
                    (TimeValue("0.000"), sync_root.value.begin),
                    (sync_root.value.begin, sync_root.value.end),
                    (sync_root.value.end, audio_file_mfcc.audio_length),
                ]
            else:
                self.log(
                    u"  Level 1 or more than one child => computing timemap")
                if not sync_root.is_empty:
                    begin = sync_root.value.begin
                    end = sync_root.value.end
                    self.log([u"  Begin: %.3f", begin])
                    self.log([u"  End:   %.3f", end])
                    audio_file_mfcc.set_head_middle_tail(
                        head_length=begin,
                        middle_length=(end - begin)
                    )
                else:
                    self.log(u"  No begin or end to set")
                time_map = self._execute_inner(
                    audio_file_mfcc,
                    text_file,
                    adjust_boundaries=adjust_boundaries,
                    log=False)
            self.log([u"  Map:   %s", str(time_map)])
            self._level_time_map_to_tree(text_file,
                                         time_map,
                                         sync_root,
                                         add_head_tail=add_head_tail)
            # store next level roots
            next_level_text_files.extend(text_file.children_not_empty)
            src = sync_root.children
            if add_head_tail:
                # if we added head and tail,
                # we must not pass them to the next level
                src = src[1:-1]
            next_level_sync_roots.extend(src)
        self._clear_cache_synthesizer()
        return (next_level_text_files, next_level_sync_roots)

    def _execute_inner(self,
                       audio_file_mfcc,
                       text_file,
                       adjust_boundaries=True,
                       log=True):
        """
        Align a subinterval of the given AudioFileMFCC
        with the given TextFile.

        Return the computed time map, as a list of intervals.

        The begin and end positions inside the AudioFileMFCC
        must have been set ahead by the caller.

        The text fragments being aligned are the vchildren of ``text_file``.

        :param audio_file_mfcc: the audio file MFCC representation
        :type  audio_file_mfcc: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        :param text_file: the text file subtree to align
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :param bool adjust_boundaries: if ``True``, execute the adjust boundary algorithm
        :param bool log: if ``True``, log steps
        :rtype: list
        """
        self._step_begin(u"synthesize text", log=log)
        synt_handler, synt_path, synt_anchors, synt_format = self._synthesize(
            text_file)
        self._step_end(log=log)

        self._step_begin(u"extract MFCC synt wave", log=log)
        synt_wave_mfcc = self._extract_mfcc(file_path=synt_path,
                                            file_format=synt_format)
        gf.delete_file(synt_handler, synt_path)
        self._step_end(log=log)

        self._step_begin(u"align waves", log=log)
        indices = self._align_waves(audio_file_mfcc, synt_wave_mfcc,
                                    synt_anchors)
        self._step_end(log=log)

        self._step_begin(u"adjust boundaries", log=log)
        time_map = self._adjust_boundaries(audio_file_mfcc, text_file, indices,
                                           adjust_boundaries)
        self._step_end(log=log)

        return time_map

    def _load_audio_file(self):
        """
        Load audio in memory.

        :rtype: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"load audio file")
        # NOTE file_format=None forces conversion to
        #      PCM16 mono WAVE with proper sample rate
        audio_file = AudioFile(file_path=self.task.audio_file_path_absolute,
                               file_format=None,
                               rconf=self.rconf,
                               logger=self.logger)
        audio_file.read_samples_from_file()
        self._step_end()
        return audio_file

    def _clear_audio_file(self, audio_file):
        """
        Clear audio from memory.

        :param audio_file: the object to clear
        :type  audio_file: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"clear audio file")
        audio_file.clear_data()
        audio_file = None
        self._step_end()

    def _extract_mfcc(self, file_path=None, file_format=None, audio_file=None):
        """
        Extract the MFCCs from the given audio file.

        :rtype: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        """
        return AudioFileMFCC(file_path=file_path,
                             file_format=file_format,
                             audio_file=audio_file,
                             rconf=self.rconf,
                             logger=self.logger)

    def _compute_head_process_tail(self, audio_file_mfcc):
        """
        Compute the head, process, and tail lengths of the audio file,
        either by reading the explicit values
        from the Task configuration,
        or by using SD to detect them.

        Return the lengths, in seconds,
        as a ``(head, process, tail)`` tuple.

        :rtype: tuple (float, float, float)
        """
        head_length = self.task.configuration["i_a_head"]
        process_length = self.task.configuration["i_a_process"]
        tail_length = self.task.configuration["i_a_tail"]
        head_max = self.task.configuration["i_a_head_max"]
        head_min = self.task.configuration["i_a_head_min"]
        tail_max = self.task.configuration["i_a_tail_max"]
        tail_min = self.task.configuration["i_a_tail_min"]
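        # if any explicit length is given, use it as-is;
        # otherwise fall back to SD, bounded by the min/max values above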
        if ((head_length is not None) or (process_length is not None)
                or (tail_length is not None)):
            self.log(u"Setting explicit head process tail")
        else:
            self.log(u"Detecting head tail...")
            sd = SD(audio_file_mfcc,
                    self.task.text_file,
                    rconf=self.rconf,
                    logger=self.logger)
            head_length = TimeValue("0.000")
            process_length = None
            tail_length = TimeValue("0.000")
            if (head_min is not None) or (head_max is not None):
                self.log(u"Detecting HEAD...")
                head_length = sd.detect_head(head_min, head_max)
                self.log([u"Detected HEAD: %.3f", head_length])
                self.log(u"Detecting HEAD... done")
            if (tail_min is not None) or (tail_max is not None):
                self.log(u"Detecting TAIL...")
                tail_length = sd.detect_tail(tail_min, tail_max)
                self.log([u"Detected TAIL: %.3f", tail_length])
                self.log(u"Detecting TAIL... done")
            self.log(u"Detecting head tail... done")
        self.log([u"Head:    %s", gf.safe_float(head_length, None)])
        self.log([u"Process: %s", gf.safe_float(process_length, None)])
        self.log([u"Tail:    %s", gf.safe_float(tail_length, None)])
        return (head_length, process_length, tail_length)

    def _set_synthesizer(self):
        """ Create synthesizer """
        self.log(u"Setting synthesizer...")
        self.synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
        self.log(u"Setting synthesizer... done")

    def _clear_cache_synthesizer(self):
        """ Clear the cache of the synthesizer """
        self.log(u"Clearing synthesizer...")
        self.synthesizer.clear_cache()
        self.log(u"Clearing synthesizer... done")

    def _synthesize(self, text_file):
        """
        Synthesize text into a WAVE file.

        Return a tuple consisting of:

        1. the handler of the generated audio file
        2. the path of the generated audio file
        3. the list of anchors, that is, a list of floats
           each representing the start time of the corresponding
           text fragment in the generated wave file
           ``[start_1, start_2, ..., start_n]``
        4. a tuple describing the format of the audio file

        :param text_file: the text to be synthesized
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :rtype: tuple (handler, string, list, tuple)
        """
        handler, path = gf.tmp_file(
            suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        result = self.synthesizer.synthesize(text_file, path)
        return (handler, path, result[0], self.synthesizer.output_audio_format)

    def _align_waves(self, real_wave_mfcc, synt_wave_mfcc, synt_anchors):
        """
        Align two AudioFileMFCC objects,
        representing WAVE files.

        Return a list of boundary indices.
        """
        self.log(u"Creating DTWAligner...")
        aligner = DTWAligner(real_wave_mfcc,
                             synt_wave_mfcc,
                             rconf=self.rconf,
                             logger=self.logger)
        self.log(u"Creating DTWAligner... done")
        self.log(u"Computing boundary indices...")
        boundary_indices = aligner.compute_boundaries(synt_anchors)
        self.log(u"Computing boundary indices... done")
        return boundary_indices

    def _adjust_boundaries(self,
                           real_wave_mfcc,
                           text_file,
                           boundary_indices,
                           adjust_boundaries=True):
        """
        Adjust boundaries as requested by the user.

        Return the computed time map, that is,
        a list of pairs ``[start_time, end_time]``,
        of length equal to number of fragments + 2,
        where the two extra elements are for
        the HEAD (first) and TAIL (last).
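
        For example (illustrative numbers), with two text fragments::

            [[0.000, 0.500], [0.500, 1.200], [1.200, 2.700], [2.700, 3.000]]
              HEAD            fragment 1      fragment 2      TAIL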
        """
        # boundary_indices contains the boundary indices in the all_mfcc of real_wave_mfcc,
        # starting with the boundary between the head and the first fragment
        # and ending with the boundary between the last fragment and the tail
        if adjust_boundaries:
            aba_algorithm, aba_parameters = self.task.configuration.aba_parameters()
            self.log([u"Running algorithm: '%s'", aba_algorithm])
        else:
            self.log(u"Forced running algorithm: 'auto'")
            aba_algorithm = AdjustBoundaryAlgorithm.AUTO
            aba_parameters = None
        return AdjustBoundaryAlgorithm(algorithm=aba_algorithm,
                                       parameters=aba_parameters,
                                       real_wave_mfcc=real_wave_mfcc,
                                       boundary_indices=boundary_indices,
                                       text_file=text_file,
                                       rconf=self.rconf,
                                       logger=self.logger).to_time_map()

    def _level_time_map_to_tree(self,
                                text_file,
                                time_map,
                                tree=None,
                                add_head_tail=True):
        """
        Convert a level time map into a Tree of SyncMapFragments.

        The time map is
        a list of pairs ``[start_time, end_time]``,
        of length equal to number of fragments + 2,
        where the two extra elements are for
        the HEAD (first) and TAIL (last).

        :param text_file: the text file object
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :param list time_map: the time map
        :param tree: the tree; if ``None``, a new Tree will be built
        :type  tree: :class:`~aeneas.tree.Tree`
        :rtype: :class:`~aeneas.tree.Tree`
        """
        if tree is None:
            tree = Tree()
        if add_head_tail:
            fragments = (
                [TextFragment(u"HEAD", self.task.configuration["language"], [u""])] +
                text_file.fragments +
                [TextFragment(u"TAIL", self.task.configuration["language"], [u""])]
            )
            i = 0
        else:
            fragments = text_file.fragments
            i = 1
        for fragment in fragments:
            interval = time_map[i]
            sm_frag = SyncMapFragment(fragment, interval[0], interval[1])
            tree.add_child(Tree(value=sm_frag))
            i += 1
        return tree

    def _select_levels(self, tree):
        """
        Select the correct levels in the tree,
        reading the ``os_task_file_levels``
        parameter in the Task configuration.

        If ``None`` or invalid, return the current sync map tree
        unchanged.
        Otherwise, return only the levels appearing in it.

        :param tree: a Tree of SyncMapFragments
        :type  tree: :class:`~aeneas.tree.Tree`
        :rtype: :class:`~aeneas.tree.Tree`
        """
        levels = self.task.configuration["o_levels"]
        self.log([u"Levels: '%s'", levels])
        if (levels is None) or (len(levels) < 1):
            return tree
        try:
            levels = [int(l) for l in levels if int(l) > 0]
            self.log([u"Converted levels: %s", levels])
        except ValueError:
            self.log_warn(
                u"Cannot convert levels to list of int, returning unchanged")
            return tree
        # remove head and tail nodes
        head = tree.vchildren[0]
        tail = tree.vchildren[-1]
        tree.remove_child(0)
        tree.remove_child(-1)
        # keep only the selected levels
        tree.keep_levels(levels)
        # add head and tail back
        tree.add_child(Tree(value=head), as_last=False)
        tree.add_child(Tree(value=tail), as_last=True)
        # return the new tree
        return tree

    def _create_syncmap(self, tree):
        """
        Return a sync map built from the provided tree of SyncMapFragments.

        :param tree: a Tree of SyncMapFragments
        :type  tree: :class:`~aeneas.tree.Tree`
        :rtype: :class:`~aeneas.syncmap.SyncMap`
        """
        self.log(
            [u"Fragments in time map (including HEAD/TAIL): %d",
             len(tree)])
        head_tail_format = self.task.configuration["o_h_t_format"]
        self.log([u"Head/tail format: %s", str(head_tail_format)])

        children = tree.vchildren
        head = children[0]
        first = children[1]
        last = children[-2]
        tail = children[-1]

        # remove HEAD fragment if needed
        if head_tail_format != SyncMapHeadTailFormat.ADD:
            tree.remove_child(0)
            self.log(u"Removed HEAD")

        # stretch first and last fragment timings if needed
        if head_tail_format == SyncMapHeadTailFormat.STRETCH:
            self.log([
                u"Stretched first.begin: %.3f => %.3f (head)", first.begin,
                head.begin
            ])
            self.log([
                u"Stretched last.end:    %.3f => %.3f (tail)", last.end,
                tail.end
            ])
            first.begin = head.begin
            last.end = tail.end

        # remove TAIL fragment if needed
        if head_tail_format != SyncMapHeadTailFormat.ADD:
            tree.remove_child(-1)
            self.log(u"Removed TAIL")

        # return sync map
        sync_map = SyncMap()
        sync_map.fragments_tree = tree
        return sync_map

    # TODO can this be done during the alignment?
    def _check_no_zero(self, min_mws):
        """ Check for fragments with zero duration """
        if self.task.configuration["o_no_zero"]:
            self.log(u"Checking for fragments with zero duration...")
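            # each zero-duration fragment will be pushed forward by at least delta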
            delta = TimeValue("0.001")
            leaves = self.task.sync_map.fragments_tree.vleaves_not_empty
            # first and last leaves are HEAD and TAIL, skipping them
            max_index = len(leaves) - 1
            self.log([u"Fragment min index: %d", 1])
            self.log([u"Fragment max index: %d", max_index - 1])
            i = 1
            while i < max_index:
                self.log([u"Checking index:     %d", i])
                j = i
                while (j < max_index) and (leaves[j].end == leaves[i].begin):
                    j += 1
                if j != i:
                    self.log(u"Fragment(s) with zero duration:")
                    for k in range(i, j):
                        self.log([u"  %d : %s", k, leaves[k]])

                    if leaves[j].end - leaves[j].begin > (j - i) * delta:
                        # there is room after the run of zero-duration fragments
                        # to move each of them forward by delta
                        for k in range(j - i):
                            shift = (k + 1) * delta
                            leaves[i + k].end += shift
                            leaves[i + k + 1].begin += shift
                            self.log([
                                u"  Moved fragment %d forward by %.3f", i + k,
                                shift
                            ])
                    else:
                        self.log_warn(u"  Unable to fix")
                    # skip past the run of fragments just processed
                    i = j
                else:
                    i += 1
            self.log(u"Checking for fragments with zero duration... done")
        else:
            self.log(u"Not checking for fragments with zero duration")
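
For reference, a minimal usage sketch of ExecuteTask, following the pattern commonly shown in the aeneas documentation; the configuration string, the language code, and the file paths are placeholders, and the exact parameter values depend on the aeneas version in use.

from aeneas.executetask import ExecuteTask
from aeneas.task import Task

# create a Task from a configuration string (placeholder values)
config_string = u"task_language=eng|is_text_type=plain|os_task_file_format=json"
task = Task(config_string=config_string)
task.audio_file_path_absolute = u"/path/to/audio.mp3"
task.text_file_path_absolute = u"/path/to/text.txt"
task.sync_map_file_path_absolute = u"/path/to/output.json"

# compute the sync map and write it to file
ExecuteTask(task).execute()
task.output_sync_map_file()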