예제 #1
0
    def perform_command(self):
        """
        Read the properties of the audio file named by the first
        actual argument and print them.

        When no argument is available, the value returned by
        ``self.print_help()`` is returned instead.
        If ``OSError``, ``AudioFileProbeError`` or
        ``AudioFileUnsupportedFormatError`` is raised, two error
        messages are printed and the error exit code is returned.

        :rtype: int
        """
        if len(self.actual_arguments) == 0:
            return self.print_help()
        audio_file_path = self.actual_arguments[0]

        # Each handler only prepares its messages; they are printed
        # in one place, right before returning the error exit code.
        try:
            audiofile = AudioFile(
                audio_file_path,
                rconf=self.rconf,
                logger=self.logger
            )
            audiofile.read_properties()
            self.print_generic(audiofile.__unicode__())
            return self.NO_ERROR_EXIT_CODE
        except OSError:
            messages = [
                u"Cannot read file '%s'" % (audio_file_path),
                u"Make sure the input file path is written/escaped correctly",
            ]
        except AudioFileProbeError:
            messages = [
                u"Unable to call the ffprobe executable '%s'" %
                (self.rconf[RuntimeConfiguration.FFPROBE_PATH]),
                u"Make sure the path to ffprobe is correct",
            ]
        except AudioFileUnsupportedFormatError:
            messages = [
                u"Cannot read properties of file '%s'" % (audio_file_path),
                u"Make sure the input file has a format supported by ffprobe",
            ]

        for message in messages:
            self.print_error(message)
        return self.ERROR_EXIT_CODE
예제 #2
0
    def perform_command(self):
        """
        Print the properties of the audio file whose path is
        the first actual argument.

        Without arguments, the result of ``self.print_help()`` is returned.
        On success ``self.NO_ERROR_EXIT_CODE`` is returned.
        On ``OSError``, ``AudioFileProbeError`` or
        ``AudioFileUnsupportedFormatError`` a problem line and an advice
        line are printed and ``self.ERROR_EXIT_CODE`` is returned.

        :rtype: int
        """
        if len(self.actual_arguments) == 0:
            return self.print_help()
        audio_file_path = self.actual_arguments[0]

        try:
            audiofile = AudioFile(
                audio_file_path,
                rconf=self.rconf,
                logger=self.logger
            )
            audiofile.read_properties()
            self.print_generic(audiofile.__unicode__())
            return self.NO_ERROR_EXIT_CODE
        except OSError:
            problem = u"Cannot read file '%s'" % (audio_file_path)
            advice = u"Make sure the input file path is written/escaped correctly"
        except AudioFileProbeError:
            ffprobe_path = self.rconf[RuntimeConfiguration.FFPROBE_PATH]
            problem = u"Unable to call the ffprobe executable '%s'" % (ffprobe_path)
            advice = u"Make sure the path to ffprobe is correct"
        except AudioFileUnsupportedFormatError:
            problem = u"Cannot read properties of file '%s'" % (audio_file_path)
            advice = u"Make sure the input file has a format supported by ffprobe"

        # Reached only when one of the handlers above ran.
        self.print_error(problem)
        self.print_error(advice)
        return self.ERROR_EXIT_CODE
예제 #3
0
    def _read_audio_data(self, file_path):
        """
        Load the samples of the audio file at ``file_path``.

        The file is opened declaring ``self.OUTPUT_AUDIO_FORMAT`` as its format.
        ``AudioFileUnsupportedFormatError`` and ``OSError`` are logged
        through ``self.log_exc`` and reported as ``(False, None)``.

        :param file_path: the path of the audio file to be read
        :rtype: tuple (True, (duration, sample_rate, codec, data)) or (False, None) on exception
        """
        try:
            self.log(u"Reading audio data...")
            # The declared format lets AudioFile read the samples directly
            # when the TTS is known to output PCM16 mono WAVE with the
            # correct sample rate, skipping a conversion through ffmpeg.
            wave = AudioFile(
                file_path=file_path,
                file_format=self.OUTPUT_AUDIO_FORMAT,
                rconf=self.rconf,
                logger=self.logger
            )
            wave.read_samples_from_file()
            self.log([u"Duration of '%s': %f", file_path, wave.audio_length])
            self.log(u"Reading audio data... done")
            payload = (
                wave.audio_length,
                wave.audio_sample_rate,
                wave.audio_format,
                wave.audio_samples,
            )
            return (True, payload)
        except (AudioFileUnsupportedFormatError, OSError) as exc:
            message = u"An unexpected error occurred while reading audio data"
            self.log_exc(message, exc, True, None)
            return (False, None)
예제 #4
0
 def load(self, path, rp=False, rs=False):
     """
     Build an ``AudioFile`` for ``path``, resolved through
     ``gf.absolute_path`` against this file.

     :param path: the path of the audio file
     :param bool rp: if ``True``, read the audio properties
     :param bool rs: if ``True``, read the audio samples from file
     :returns: the ``AudioFile`` object
     """
     resolved_path = gf.absolute_path(path, __file__)
     audio_file = AudioFile(resolved_path)
     if rp:
         audio_file.read_properties()
     if rs:
         audio_file.read_samples_from_file()
     return audio_file
예제 #5
0
 def test_load_audio_file(self):
     # An AudioFileMFCC built from an AudioFile whose samples were read
     # must expose its MFCCs and report a length of about 53.3 (53.266).
     wave_path = gf.absolute_path(self.AUDIO_FILE_WAVE, __file__)
     source = AudioFile(wave_path)
     source.read_samples_from_file()
     mfcc_file = AudioFileMFCC(audio_file=source)
     self.assertIsNotNone(mfcc_file.all_mfcc)
     self.assertAlmostEqual(mfcc_file.audio_length, TimeValue("53.3"), places=1)
예제 #6
0
 def _populate_audio_file(self):
     """
     Set ``self.audio_file`` to an ``AudioFile`` created from
     ``self.audio_file_path_absolute`` and read its properties.

     Nothing is done when the path is ``None``.
     """
     if self.audio_file_path_absolute is None:
         return
     self.audio_file = AudioFile(
         file_path=self.audio_file_path_absolute,
         logger=None
     )
     self.audio_file.read_properties()
예제 #7
0
def main():
    """
    Entry point: print the properties of the audio file whose path
    is the first command line argument.

    If no argument is given, ``usage()`` is called and nothing else is done.
    """
    if len(sys.argv) < 2:
        usage()
        return
    file_path = sys.argv[1]
    audiofile = AudioFile(file_path)
    audiofile.read_properties()
    # Parenthesised single-argument form: same output as the Python 2
    # print statement, and valid syntax on Python 3 as well.
    print(str(audiofile))
예제 #8
0
def main():
    """
    Entry point: read the properties of the audio file given as the
    first command line argument and print its string representation.

    When the argument is missing, ``usage()`` is called and the function returns.
    """
    if len(sys.argv) < 2:
        usage()
        return
    file_path = sys.argv[1]
    audiofile = AudioFile(file_path)
    audiofile.read_properties()
    # A single parenthesised argument prints identically with the
    # Python 2 print statement and with the Python 3 print function.
    print(str(audiofile))
예제 #9
0
 def test_add_samples_reverse_memory(self):
     # Two chunks are added with reverse=True; the checked positions
     # must hold each chunk reversed, with the chunks in insertion order.
     audiofile = AudioFile()
     for chunk in ([1, 2, 3, 4, 5], [6, 7, 8, 9, 10]):
         audiofile.add_samples(numpy.array(chunk), reverse=True)
     self.assertEqual(len(audiofile.audio_samples), 10)
     expected_by_index = ((0, 5), (1, 4), (4, 1), (5, 10), (6, 9), (9, 6))
     for index, expected in expected_by_index:
         self.assertEqual(audiofile.audio_samples[index], expected)
예제 #10
0
 def test_add_samples_memory(self):
     # Two chunks are added in sequence; the checked positions
     # must hold the samples in the order they were added.
     audiofile = AudioFile()
     for chunk in ([1, 2, 3, 4, 5], [6, 7, 8, 9, 10]):
         audiofile.add_samples(numpy.array(chunk))
     self.assertEqual(len(audiofile.audio_samples), 10)
     expected_by_index = ((0, 1), (1, 2), (4, 5), (5, 6), (6, 7), (9, 10))
     for index, expected in expected_by_index:
         self.assertEqual(audiofile.audio_samples[index], expected)
예제 #11
0
 def __init__(
         self,
         file_path=None,
         file_path_is_mono_wave=False,
         mfcc_matrix=None,
         audio_file=None,
         rconf=None,
         logger=None
 ):
     # At least one source of MFCC data must be supplied.
     nothing_given = (file_path is None) and (audio_file is None) and (mfcc_matrix is None)
     if nothing_given:
         raise ValueError(u"You must initialize with at least one of: file_path, audio_file, or mfcc_matrix")
     super(AudioFileMFCC, self).__init__(rconf=rconf, logger=logger)
     self.file_path = file_path
     self.audio_file = audio_file
     self.is_reversed = False
     self.__mfcc = None
     self.__mfcc_mask = None
     self.__mfcc_mask_map = None
     self.__speech_intervals = None
     self.__nonspeech_intervals = None
     self.log(u"Initializing MFCCs...")
     if mfcc_matrix is not None:
         # A ready-made matrix takes precedence over file_path/audio_file.
         self.__mfcc = mfcc_matrix
         self.audio_length = self.all_length * self.rconf.mws
     elif not ((self.file_path is None) and (self.audio_file is None)):
         # Remember whether the AudioFile is created here, so that it
         # can be released as soon as the MFCCs have been computed.
         created_here = self.audio_file is None
         if created_here:
             self.audio_file = AudioFile(
                 self.file_path,
                 is_mono_wave=file_path_is_mono_wave,
                 rconf=self.rconf,
                 logger=self.logger
             )
             # NOTE load audio samples into memory, if not present already
             self.audio_file.audio_samples
         use_c_extension = self.rconf[RuntimeConfiguration.C_EXTENSIONS]
         gf.run_c_extension_with_fallback(
             self.log,
             "cmfcc",
             self._compute_mfcc_c_extension,
             self._compute_mfcc_pure_python,
             (),
             c_extension=use_c_extension
         )
         self.audio_length = self.audio_file.audio_length
         if created_here:
             self.log(u"Clearing the audio data...")
             self.audio_file.clear_data()
             self.audio_file = None
             self.log(u"Clearing the audio data... done")
     # Initially the MIDDLE interval spans all the columns of the matrix.
     self.__middle_begin = 0
     self.__middle_end = self.__mfcc.shape[1]
     self.log(u"Initializing MFCCs... done")
예제 #12
0
 def test_compute_mfcc(self):
     # The checks run only if the cmfcc C extension can be imported;
     # an ImportError is swallowed and nothing is asserted.
     try:
         import aeneas.cmfcc.cmfcc
         wave = AudioFile(self.AUDIO)
         wave.read_samples_from_file()
         computed = aeneas.cmfcc.cmfcc.compute_from_data(
             wave.audio_samples,
             wave.audio_sample_rate,
             40,
             13,
             512,
             133.3333,
             6855.4976,
             0.97,
             0.025,
             0.010
         )
         # The first element of the result, transposed, is expected
         # to have 13 rows and at least one column.
         mfcc_c = computed[0].transpose()
         self.assertEqual(mfcc_c.shape[0], 13)
         self.assertGreater(mfcc_c.shape[1], 0)
     except ImportError:
         pass
예제 #13
0
    def _load_audio_file(self):
        """
        Create an ``AudioFile`` for the audio file of the task and
        read its samples, between ``self._step_begin`` and ``self._step_end``.

        :rtype: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"load audio file")
        loaded = AudioFile(
            file_path=self.task.audio_file_path_absolute,
            is_mono_wave=False,
            rconf=self.rconf,
            logger=self.logger
        )
        loaded.read_samples_from_file()
        self._step_end()
        return loaded
예제 #14
0
    def compute_mfcc(self):
        """
        Compute the MFCCs of the real wave and of the synt wave,
        storing the MFCCs and the audio length of each in this object.

        The real wave is processed first; the synt wave is checked
        and processed only afterwards.

        :raises OSError: if a wave path is ``None`` or is not an existing file
        """
        if (self.real_wave_path is None) or (not os.path.isfile(self.real_wave_path)):
            self._log(["Input file '%s' cannot be read", self.real_wave_path], Logger.CRITICAL)
            raise OSError("Input file cannot be read")
        self._log("Computing MFCCs for real wave...")
        real_wave = AudioFile(self.real_wave_path, logger=self.logger)
        real_wave.extract_mfcc(self.frame_rate)
        self.real_wave_full_mfcc = real_wave.audio_mfcc
        self.real_wave_length = real_wave.audio_length
        self._log("Computing MFCCs for real wave... done")

        if (self.synt_wave_path is None) or (not os.path.isfile(self.synt_wave_path)):
            self._log(["Input file '%s' cannot be read", self.synt_wave_path], Logger.CRITICAL)
            raise OSError("Input file cannot be read")
        self._log("Computing MFCCs for synt wave...")
        synt_wave = AudioFile(self.synt_wave_path, logger=self.logger)
        synt_wave.extract_mfcc(self.frame_rate)
        self.synt_wave_full_mfcc = synt_wave.audio_mfcc
        self.synt_wave_length = synt_wave.audio_length
        self._log("Computing MFCCs for synt wave... done")
예제 #15
0
    def _load_audio_file(self):
        """
        Read into memory the samples of the audio file of the task.

        The work is bracketed by ``self._step_begin`` and ``self._step_end``.

        :rtype: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"load audio file")
        source_path = self.task.audio_file_path_absolute
        task_audio = AudioFile(
            file_path=source_path,
            is_mono_wave=False,
            rconf=self.rconf,
            logger=self.logger
        )
        task_audio.read_samples_from_file()
        self._step_end()
        return task_audio
예제 #16
0
 def _extract_mfcc(self, audio_file_path):
     """
     Extract the MFCCs of the audio file at ``audio_file_path``.

     Any exception raised while doing so is logged and
     converted into a failure result.

     :param audio_file_path: the path of the audio file
     :rtype: tuple ``(True, mfcc, length)`` or ``(False, None, None)`` on exception
     """
     self._log("Extracting MFCCs from real full wave")
     try:
         wave = AudioFile(audio_file_path, logger=self.logger)
         wave.extract_mfcc()
         self._log("Extracting MFCCs from real full wave: succeeded")
         outcome = (True, wave.audio_mfcc, wave.audio_length)
     except Exception as exc:
         self._log("Extracting MFCCs from real full wave: failed")
         self._log(["Message: %s", str(exc)])
         outcome = (False, None, None)
     return outcome
예제 #17
0
 def _extract_mfcc(self, audio_file_path):
     """
     Compute the MFCCs of the audio file at ``audio_file_path``.

     :param audio_file_path: the path of the audio file
     :rtype: tuple ``(True, mfcc, length)`` on success,
             ``(False, None, None)`` if any exception is raised (it is logged)
     """
     failure = (False, None, None)
     self._log("Extracting MFCCs from real full wave")
     try:
         full_wave = AudioFile(audio_file_path, logger=self.logger)
         full_wave.extract_mfcc()
         self._log("Extracting MFCCs from real full wave: succeeded")
         return (True, full_wave.audio_mfcc, full_wave.audio_length)
     except Exception as error:
         self._log("Extracting MFCCs from real full wave: failed")
         self._log(["Message: %s", str(error)])
         return failure
예제 #18
0
 def test_preallocate_smaller(self):
     # Preallocating for fewer samples than currently stored
     # must leave only that many samples.
     audio = AudioFile()

     def stored():
         return len(audio.audio_samples)

     audio.preallocate_memory(100)
     self.assertEqual(stored(), 0)
     audio.add_samples(numpy.array([1, 2, 3, 4, 5]))
     self.assertEqual(stored(), 5)
     audio.preallocate_memory(2)
     self.assertEqual(stored(), 2)
예제 #19
0
    def _load_audio_file(self):
        """
        Create an ``AudioFile`` for the audio file of the task and
        read its samples, between ``self._step_begin`` and ``self._step_end``.

        :rtype: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"load audio file")
        # NOTE passing file_format=None forces a conversion to
        #      PCM16 mono WAVE with the default sample rate
        loaded = AudioFile(
            file_path=self.task.audio_file_path_absolute,
            file_format=None,
            rconf=self.rconf,
            logger=self.logger
        )
        loaded.read_samples_from_file()
        self._step_end()
        return loaded
예제 #20
0
파일: task.py 프로젝트: shivupoojar/DeFog
 def _populate_audio_file(self):
     """
     Set ``self.audio_file`` to an ``AudioFile`` created from
     ``self.audio_file_path_absolute`` and read its properties.

     When the path is ``None`` only log messages are emitted.
     """
     self.log(u"Populate audio file...")
     if self.audio_file_path_absolute is None:
         self.log(u"audio_file_path_absolute is None")
     else:
         self.log([u"audio_file_path_absolute is '%s'", self.audio_file_path_absolute])
         self.audio_file = AudioFile(
             file_path=self.audio_file_path_absolute,
             logger=self.logger
         )
         self.audio_file.read_properties()
     self.log(u"Populate audio file... done")
예제 #21
0
    def _load_audio_file(self):
        """
        Read into memory the samples of the audio file of the task.

        The work is bracketed by ``self._step_begin`` and ``self._step_end``.

        :rtype: :class:`~aeneas.audiofile.AudioFile`
        """
        self._step_begin(u"load audio file")
        source_path = self.task.audio_file_path_absolute
        # NOTE file_format=None forces conversion to
        #      PCM16 mono WAVE with default sample rate
        task_audio = AudioFile(file_path=source_path,
                               file_format=None,
                               rconf=self.rconf,
                               logger=self.logger)
        task_audio.read_samples_from_file()
        self._step_end()
        return task_audio
예제 #22
0
 def test_preallocate_smaller(self):
     # Steps: preallocate 100 -> 0 samples; add 5 -> 5 samples;
     # preallocate 2 -> only 2 samples are left.
     audio = AudioFile()
     audio.preallocate_memory(100)
     self.assertEqual(len(audio.audio_samples), 0)
     five_samples = numpy.array([1, 2, 3, 4, 5])
     audio.add_samples(five_samples)
     self.assertEqual(len(audio.audio_samples), 5)
     audio.preallocate_memory(2)
     self.assertEqual(len(audio.audio_samples), 2)
예제 #23
0
 def compute_mfcc(self):
     """
     Compute the MFCCs of the wave at ``self.wave_path``, storing them
     in ``self.wave_mfcc`` and the audio length in ``self.wave_len``.

     :raises OSError: if ``self.wave_path`` is ``None`` or is not an existing file
     :raises IOError: if raised while creating the audio file or
                      extracting its MFCCs (logged, then re-raised)
     """
     if (self.wave_path is None) or (not os.path.isfile(self.wave_path)):
         self._log(["Input file '%s' cannot be read", self.wave_path], Logger.CRITICAL)
         raise OSError("Input file cannot be read")
     self._log("Computing MFCCs for wave...")
     try:
         wave = AudioFile(self.wave_path, logger=self.logger)
         wave.extract_mfcc(self.frame_rate)
         self.wave_mfcc = wave.audio_mfcc
         self.wave_len = wave.audio_length
     except IOError as e:
         self._log("IOError", Logger.CRITICAL)
         self._log(["Message: %s", e])
         # Bare raise re-raises the active exception keeping its original
         # traceback ("raise e" would restart it from here on Python 2).
         raise
     self._log("Computing MFCCs for wave... done")
예제 #24
0
 def _populate_audio_file(self):
     """
     Create ``self.audio_file`` from the audio file at
     ``self.audio_file_path_absolute`` and read its properties.

     If the path is ``None`` the method returns without doing anything.
     """
     if self.audio_file_path_absolute is None:
         return
     self.audio_file = AudioFile(file_path=self.audio_file_path_absolute, logger=None)
     self.audio_file.read_properties()
예제 #25
0
 def __init__(
         self,
         file_path=None,
         file_path_is_mono_wave=False,
         mfcc_matrix=None,
         audio_file=None,
         rconf=None,
         logger=None
 ):
     # Refuse to build an object without any source of MFCC data.
     nothing_given = (file_path is None) and (audio_file is None) and (mfcc_matrix is None)
     if nothing_given:
         raise ValueError(u"You must initialize with at least one of: file_path, audio_file, or mfcc_matrix")
     super(AudioFileMFCC, self).__init__(rconf=rconf, logger=logger)
     self.file_path = file_path
     self.audio_file = audio_file
     self.is_reversed = False
     self.__mfcc = None
     self.__mfcc_mask = None
     self.__mfcc_mask_map = None
     self.__speech_intervals = None
     self.__nonspeech_intervals = None
     self.log(u"Initializing MFCCs...")
     if mfcc_matrix is not None:
         # The given matrix is used as is; the length is derived from it.
         self.__mfcc = mfcc_matrix
         self.audio_length = self.all_length * self.rconf.mws
     elif not ((self.file_path is None) and (self.audio_file is None)):
         # True when the AudioFile is created (and later released) here,
         # False when the caller supplied its own audio_file.
         created_here = self.audio_file is None
         if created_here:
             self.audio_file = AudioFile(
                 self.file_path,
                 is_mono_wave=file_path_is_mono_wave,
                 rconf=self.rconf,
                 logger=self.logger
             )
             # NOTE load audio samples into memory, if not present already
             self.audio_file.audio_samples
         gf.run_c_extension_with_fallback(
             self.log,
             "cmfcc",
             self._compute_mfcc_c_extension,
             self._compute_mfcc_pure_python,
             (),
             rconf=self.rconf
         )
         self.audio_length = self.audio_file.audio_length
         if created_here:
             self.log(u"Clearing the audio data...")
             self.audio_file.clear_data()
             self.audio_file = None
             self.log(u"Clearing the audio data... done")
     # Initially the MIDDLE interval spans all the columns of the matrix.
     self.__middle_begin = 0
     self.__middle_end = self.__mfcc.shape[1]
     self.log(u"Initializing MFCCs... done")
예제 #26
0
파일: vad.py 프로젝트: fduch2k/aeneas
 def compute_mfcc(self):
     """
     Compute the MFCCs of the wave at ``self.wave_path``
     and store them internally, together with the audio length.

     :raises OSError: if ``self.wave_path`` is ``None`` or is not an existing file
     :raises IOError: if raised while creating the audio file or
                      extracting its MFCCs (logged, then re-raised)
     """
     wave_is_readable = (self.wave_path is not None) and (os.path.isfile(self.wave_path))
     if not wave_is_readable:
         self._log(["Input file '%s' cannot be read", self.wave_path],
                   Logger.CRITICAL)
         raise OSError("Input file cannot be read")
     self._log("Computing MFCCs for wave...")
     try:
         wave = AudioFile(self.wave_path, logger=self.logger)
         wave.extract_mfcc(self.frame_rate)
         self.wave_mfcc = wave.audio_mfcc
         self.wave_len = wave.audio_length
     except IOError as e:
         self._log("IOError", Logger.CRITICAL)
         self._log(["Message: %s", e])
         # Re-raise the exception being handled without resetting its
         # traceback, which "raise e" would do on Python 2.
         raise
     self._log("Computing MFCCs for wave... done")
예제 #27
0
 def test_compute_mfcc(self):
     # Nothing is asserted when the cmfcc C extension cannot be
     # imported: the ImportError is deliberately ignored.
     try:
         import aeneas.cmfcc.cmfcc
         audio_file = AudioFile(self.AUDIO)
         audio_file.read_samples_from_file()
         # Positional parameters passed after the samples and sample rate.
         parameters = (40, 13, 512, 133.3333, 6855.4976, 0.97, 0.025, 0.010)
         result = aeneas.cmfcc.cmfcc.compute_from_data(
             audio_file.audio_samples,
             audio_file.audio_sample_rate,
             *parameters
         )
         mfcc_c = result[0].transpose()
         self.assertEqual(mfcc_c.shape[0], 13)
         self.assertGreater(mfcc_c.shape[1], 0)
     except ImportError:
         pass
예제 #28
0
 def load(self, path, rp=False, rs=False):
     """
     Return an ``AudioFile`` for ``path``, which is resolved with
     ``gf.absolute_path`` against this file.

     :param path: the path of the audio file
     :param bool rp: read the audio properties before returning
     :param bool rs: read the audio samples from file before returning
     """
     loaded = AudioFile(gf.absolute_path(path, __file__))
     if rp:
         loaded.read_properties()
     if rs:
         loaded.read_samples_from_file()
     return loaded
예제 #29
0
파일: task.py 프로젝트: ptrwtts/aeneas
 def _populate_audio_file(self):
     """
     Create ``self.audio_file`` from the audio file at
     ``self.audio_file_path_absolute`` and read its properties,
     logging each step.

     If the path is ``None``, that fact is logged and no object is created.
     """
     self.log(u"Populate audio file...")
     if self.audio_file_path_absolute is None:
         self.log(u"audio_file_path_absolute is None")
     else:
         self.log([
             u"audio_file_path_absolute is '%s'",
             self.audio_file_path_absolute
         ])
         self.audio_file = AudioFile(file_path=self.audio_file_path_absolute, logger=self.logger)
         self.audio_file.read_properties()
     self.log(u"Populate audio file... done")
예제 #30
0
 def test_add_samples_reverse_memory(self):
     # With reverse=True the checked positions must show each chunk
     # reversed, the first chunk still preceding the second one.
     audiofile = AudioFile()
     audiofile.add_samples(numpy.array([1, 2, 3, 4, 5]), reverse=True)
     audiofile.add_samples(numpy.array([6, 7, 8, 9, 10]), reverse=True)
     self.assertEqual(len(audiofile.audio_samples), 10)
     checked_indices = (0, 1, 4, 5, 6, 9)
     expected_values = (5, 4, 1, 10, 9, 6)
     for index, expected in zip(checked_indices, expected_values):
         self.assertEqual(audiofile.audio_samples[index], expected)
예제 #31
0
 def test_add_samples_memory(self):
     # Samples added in two chunks must be found, at the checked
     # positions, in the same order in which they were added.
     audiofile = AudioFile()
     audiofile.add_samples(numpy.array([1, 2, 3, 4, 5]))
     audiofile.add_samples(numpy.array([6, 7, 8, 9, 10]))
     self.assertEqual(len(audiofile.audio_samples), 10)
     checked_indices = (0, 1, 4, 5, 6, 9)
     expected_values = (1, 2, 5, 6, 7, 10)
     for index, expected in zip(checked_indices, expected_values):
         self.assertEqual(audiofile.audio_samples[index], expected)
예제 #32
0
def main():
    """
    Entry point: extract the MFCCs of an audio file and save them as text.

    The first command line argument is the path of the audio file,
    the second one is the path of the file written with ``numpy.savetxt``.
    If fewer than two arguments are given, ``usage()`` is called
    and nothing else is done.
    """
    if len(sys.argv) < 3:
        usage()
        return
    file_path = sys.argv[1]
    save_path = sys.argv[2]

    # Every print below takes a single parenthesised argument, which
    # prints identically with the Python 2 print statement and with
    # the Python 3 print function.
    if not gf.can_run_c_extension():
        print("[WARN] Unable to load Python C Extensions")
        print("[WARN] Running the slower pure Python code")
        print("[WARN] See the README file for directions to compile the Python C Extensions")

    audiofile = AudioFile(file_path)
    audiofile.load_data()
    audiofile.extract_mfcc()
    # The audio data is cleared before the MFCCs are read and saved.
    audiofile.clear_data()
    numpy.savetxt(save_path, audiofile.audio_mfcc)
    print("[INFO] MFCCs saved to %s" % (save_path))
예제 #33
0
파일: sd.py 프로젝트: cambell-prince/aeneas
    def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
        """
        Detect the start time of the text in the audio.

        A query of ``max_start_length * self.QUERY_FACTOR`` seconds
        is synthesized from ``self.text_file`` into a temporary WAVE file,
        and its MFCCs are extracted.
        Then, for each interval of ``self.audio_speech`` whose begin
        lies in ``[min_start_length, max_start_length]``, the accumulated
        cost matrix between a window of the audio MFCCs starting at that
        interval and the query MFCCs is computed with a ``DTWAligner``,
        and a candidate (start index, length, value, distortion) is recorded.
        The candidate chosen by ``self._select_best_candidate`` determines
        the returned time.

        :param min_start_length: lower bound for the begin of the speech intervals considered
        :param max_start_length: upper bound for the begin of the speech intervals considered;
                                 it also determines the query duration and the audio window
        :param metric: compared against ``SDMetric.VALUE`` and ``SDMetric.DISTORTION``
                       to track improvements; also passed to ``self._select_best_candidate``
        :param bool backwards: passed to the synthesizer; if ``True``, the query audio is reversed
        :returns: the start time of the best candidate, or ``0.0`` if there is none
        """

        self._log(["Min start length: %.3f", min_start_length])
        self._log(["Max start length: %.3f", max_start_length])
        self._log(["Metric:           %s", metric])
        self._log(["Backwards:        %s", str(backwards)])

        # characters of text per unit of audio length
        audio_rate = self.text_file.characters / self.audio_file.audio_length
        self._log(["Audio rate:     %.3f", audio_rate])

        self._log("Synthesizing query...")
        # NOTE(review): the temporary file is removed by self._cleanup() below,
        # outside any try/finally: if synthesis or MFCC extraction raises,
        # the temporary file is left behind -- confirm whether this is acceptable
        tmp_handler, tmp_file_path = tempfile.mkstemp(
            suffix=".wav",
            dir=gf.custom_tmp_dir()
        )
        synt = Synthesizer(logger=self.logger)
        synt_duration = max_start_length * self.QUERY_FACTOR
        self._log(["Synthesizing %.3f seconds", synt_duration])
        result = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=backwards
        )
        self._log("Synthesizing query... done")

        query_file = AudioFile(tmp_file_path)
        if backwards:
            self._log("Reversing query")
            query_file.reverse()
        self._log("Extracting MFCCs for query...")
        query_file.extract_mfcc(frame_rate=self.frame_rate)
        query_file.clear_data()
        self._log("Extracting MFCCs for query... done")

        self._log("Cleaning up...")
        self._cleanup(tmp_handler, tmp_file_path)
        self._log("Cleaning up... done")

        # the third element of the synthesizer result is used
        # as the number of characters in the query
        query_characters = result[2]
        query_len = query_file.audio_length
        query_mfcc = query_file.audio_mfcc
        query_rate = query_characters / query_len

        # the stretch factor is never smaller than 1
        stretch_factor = max(1, query_rate / audio_rate)
        self._log(["Audio rate:     %.3f", audio_rate])
        self._log(["Query rate:     %.3f", query_rate])
        self._log(["Stretch factor: %.3f", stretch_factor])

        # keep only the first columns (frames) of the audio MFCCs,
        # up to max_start_length * AUDIO_FACTOR * frame_rate of them
        audio_mfcc = self.audio_file.audio_mfcc
        self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
        audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
        self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
        audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
        audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
        self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])

        # o = number of frames of the (limited) audio,
        # n = number of frames of the query;
        # l is overwritten and not used afterwards
        l, o = audio_mfcc.shape
        l, n = query_mfcc.shape

        # minimum length of a matched interval in the real audio
        stretched_match_minimum_length = int(n * stretch_factor)

        self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
        self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
        self._log(["Stretch factor:          %.3f", stretch_factor])
        self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
        self._log("Speech intervals:")
        for interval in self.audio_speech:
            self._log(["  %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])

        # only the intervals whose begin is within [min_start_length, max_start_length] are evaluated
        admissible_intervals = [x for x in self.audio_speech if ((x[0] >= min_start_length) and (x[0] <= max_start_length))]
        self._log("AdmissibleSpeech intervals:")
        for interval in admissible_intervals:
            self._log(["  %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])

        # counters of consecutive runs used to stop the search early,
        # and the best value/distortion seen so far
        candidates = []
        runs_with_min_length = 0
        runs_no_improvement = 0
        runs_min_distortion = numpy.inf
        runs_min_value = numpy.inf

        for interval in admissible_intervals:
            if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
                self._log("  Breaking: too many runs without improvement")
                break

            if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
                self._log("  Breaking: too many runs with minimum required length")
                break

            start_time = interval[0]
            start_index = self._t2i(start_time)
            self._log(["Evaluating interval starting at %d == %.3f ", start_index, start_time])
            if start_index > o:
                self._log("  Breaking: start index outside audio window")
                break

            # the window must contain at least the minimum match length,
            # and it spans at most 2 * n frames from the start index
            req_end_index = start_index + stretched_match_minimum_length
            req_end_time = self._i2t(req_end_index)
            if req_end_index > o:
                self._log("  Breaking: not enough audio left in shifted window")
                break
            end_index = min(start_index + 2 * n, o)
            end_time = self._i2t(end_index)

            self._log(["  Start   %d == %.3f", start_index, start_time])
            self._log(["  Req end %d == %.3f", req_end_index, req_end_time])
            self._log(["  Eff end %d == %.3f", end_index, end_time])

            audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
            l, m = audio_mfcc_sub.shape

            self._log("Computing DTW...")
            # the aligner is created without wave paths:
            # the MFCCs and the lengths are set directly on it
            aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
            aligner.real_wave_full_mfcc = audio_mfcc_sub
            aligner.synt_wave_full_mfcc = query_mfcc
            aligner.real_wave_length = self._i2t(m)
            aligner.synt_wave_length = self._i2t(n)
            acm = aligner.compute_accumulated_cost_matrix()
            # transpose, so we have an n x m accumulated cost matrix
            acm = acm.transpose()
            last_row = acm[-1, :]
            self._log("Computing DTW... done")

            # find the minimum, but its index must be >= stretched_match_minimum_length
            # NOTE(review): if stretched_match_minimum_length >= len(last_row)
            # the slice below is empty and numpy.argmin raises ValueError;
            # the window has at most 2 * n frames, so this looks possible
            # when the stretch factor is large -- confirm
            candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
            candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
            candidate_length_time = self._i2t(candidate_length_index)
            candidate_value = last_row[candidate_length_index]
            candidate_end_index = start_index + candidate_length_index
            candidate_end_time = self._i2t(candidate_end_index)
            # distortion = accumulated cost divided by the candidate length (in frames)
            candidate_distortion = candidate_value / candidate_length_index

            # check if the candidate has minimum length
            if candidate_length_index == stretched_match_minimum_length:
                runs_with_min_length += 1
            else:
                runs_with_min_length = 0

            # check if the candidate improved the global minimum value
            if metric == SDMetric.VALUE:
                if candidate_value < runs_min_value:
                    runs_min_value = candidate_value
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1
            if metric == SDMetric.DISTORTION:
                if candidate_distortion < runs_min_distortion:
                    runs_min_distortion = candidate_distortion
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1

            # append to the list of candidates
            self._log(["    Interval  start:      %d == %.6f", start_index, start_time])
            self._log(["    Interval  end:        %d == %.6f", end_index, end_time])
            self._log(["    Candidate start:      %d == %.6f", start_index, start_time])
            self._log(["    Candidate end:        %d == %.6f", candidate_end_index, candidate_end_time])
            self._log(["    Candidate length:     %d == %.6f", candidate_length_index, candidate_length_time])
            self._log(["    Candidate value:      %.6f", candidate_value])
            self._log(["    Candidate distortion: %.6f", candidate_distortion])
            candidates.append({
                "start_index": start_index,
                "length": candidate_length_index,
                "value": candidate_value,
                "distortion": candidate_distortion
            })

        # select best candidate and return its start time
        # if we have no best candidate, return 0.0
        best_candidate = self._select_best_candidate(candidates, metric)
        if best_candidate is None:
            return 0.0
        sd_time = self._i2t(max(best_candidate["start_index"], 0))
        self._log(["Returning time %.3f", sd_time])
        return sd_time
예제 #34
0
 def test_preallocate(self):
     # Reading the samples of a freshly created AudioFile must raise;
     # after preallocating memory, zero samples are reported.
     audio = AudioFile()
     self.assertRaises(AudioFileNotInitializedError, getattr, audio, "audio_samples")
     audio.preallocate_memory(100)
     self.assertEqual(len(audio.audio_samples), 0)
예제 #35
0
 def test_create_none(self):
     # Creating an AudioFile without arguments must not raise.
     AudioFile()
예제 #36
0
class AudioFileMFCC(Loggable):
    """
    A monoaural (single channel) WAVE audio file,
    represented as a NumPy 2D matrix of
    Mel-frequency cepstral coefficients (MFCC).

    The matrix is "fat", that is,
    its number of rows is equal to the number of MFCC coefficients
    and its number of columns is equal to the number of window shifts
    in the audio file.
    The number of MFCC coefficients and the MFCC window shift can
    be modified via the
    :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_SIZE`
    and
    :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_WINDOW_SHIFT`
    keys in the ``rconf`` object.

    If ``mfcc_matrix`` is not ``None``,
    it will be used as the MFCC matrix.

    If ``file_path`` or ``audio_file`` is not ``None``,
    the MFCCs will be computed upon creation of the object,
    possibly converting to PCM16 Mono WAVE and/or
    loading audio data in memory.

    The MFCCs for the entire wave
    are divided into three
    contiguous intervals (possibly, zero-length)::

        HEAD   = [:middle_begin[
        MIDDLE = [middle_begin:middle_end[
        TAIL   = [middle_end:[

    The usual NumPy convention of including the left/start index
    and excluding the right/end index is adopted.

    For alignment purposes, only the ``MIDDLE`` portion of the wave
    is taken into account; the ``HEAD`` and ``TAIL`` intervals are ignored.

    Speech and nonspeech intervals, computed by :func:`run_vad`,
    are stored internally as pairs ``(first, last)`` of frame indices,
    where both ``first`` and ``last`` belong to the interval.

    This class heavily uses NumPy views and in-place operations
    to avoid creating temporary data or copying data around.

    :param string file_path: the path of the PCM16 mono WAVE file, or ``None``
    :param bool file_path_is_mono_wave: set to ``True`` if the audio file at ``file_path`` is a PCM16 mono WAVE file
    :param mfcc_matrix: the MFCC matrix to be set, or ``None``
    :type  mfcc_matrix: :class:`numpy.ndarray`
    :param audio_file: an audio file, or ``None``
    :type  audio_file: :class:`~aeneas.audiofile.AudioFile`
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    :raises: ValueError: if ``file_path``, ``audio_file``, and ``mfcc_matrix`` are all ``None``

    .. versionadded:: 1.5.0
    """

    TAG = u"AudioFileMFCC"

    def __init__(
            self,
            file_path=None,
            file_path_is_mono_wave=False,
            mfcc_matrix=None,
            audio_file=None,
            rconf=None,
            logger=None
    ):
        if (file_path is None) and (audio_file is None) and (mfcc_matrix is None):
            raise ValueError(u"You must initialize with at least one of: file_path, audio_file, or mfcc_matrix")
        super(AudioFileMFCC, self).__init__(rconf=rconf, logger=logger)
        self.file_path = file_path
        self.audio_file = audio_file
        self.is_reversed = False
        self.__mfcc = None
        self.__mfcc_mask = None
        self.__mfcc_mask_map = None
        self.__speech_intervals = None
        self.__nonspeech_intervals = None
        self.log(u"Initializing MFCCs...")
        if mfcc_matrix is not None:
            # an explicit matrix takes precedence over file_path/audio_file;
            # the audio length is then estimated from the number of frames
            self.__mfcc = mfcc_matrix
            self.audio_length = self.all_length * self.rconf.mws
        elif (self.file_path is not None) or (self.audio_file is not None):
            audio_file_was_none = False
            if self.audio_file is None:
                # create a temporary AudioFile from file_path;
                # it is cleared and dropped once the MFCCs are computed
                audio_file_was_none = True
                self.audio_file = AudioFile(
                    self.file_path,
                    is_mono_wave=file_path_is_mono_wave,
                    rconf=self.rconf,
                    logger=self.logger
                )
                # NOTE load audio samples into memory, if not present already
                self.audio_file.audio_samples
            gf.run_c_extension_with_fallback(
                self.log,
                "cmfcc",
                self._compute_mfcc_c_extension,
                self._compute_mfcc_pure_python,
                (),
                rconf=self.rconf
            )
            self.audio_length = self.audio_file.audio_length
            if audio_file_was_none:
                self.log(u"Clearing the audio data...")
                self.audio_file.clear_data()
                self.audio_file = None
                self.log(u"Clearing the audio data... done")
        # initially MIDDLE spans the whole wave, HEAD and TAIL are empty
        self.__middle_begin = 0
        self.__middle_end = self.__mfcc.shape[1]
        self.log(u"Initializing MFCCs... done")

    def __unicode__(self):
        msg = [
            u"File path:        %s" % self.file_path,
            u"Audio length (s): %s" % gf.safe_float(self.audio_length),
        ]
        return u"\n".join(msg)

    def __str__(self):
        return gf.safe_str(self.__unicode__())

    @property
    def all_mfcc(self):
        """
        The MFCCs of the entire audio file,
        that is, HEAD + MIDDLE + TAIL.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        return self.__mfcc

    @property
    def all_length(self):
        """
        The length, in MFCC coefficients,
        of the entire audio file,
        that is, HEAD + MIDDLE + TAIL.

        :rtype: int
        """
        return self.__mfcc.shape[1]

    @property
    def middle_mfcc(self):
        """
        The MFCCs of the middle part of the audio file,
        that is, without HEAD and TAIL.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        return self.__mfcc[:, self.__middle_begin:self.__middle_end]

    @property
    def middle_length(self):
        """
        The length, in MFCC coefficients,
        of the middle part of the audio file,
        that is, without HEAD and TAIL.

        :rtype: int
        """
        return self.__middle_end - self.__middle_begin

    @property
    def middle_map(self):
        """
        Return the map
        from the MFCC frame indices
        in the MIDDLE portion of the wave
        to the MFCC FULL frame indices,
        that is, an ``numpy.arange(self.middle_begin, self.middle_end)``.

        NOTE: to translate indices of MIDDLE,
        instead of using fancy indexing with the
        result of this function, you might want to simply
        add ``self.head_length``.
        This function is provided mostly for consistency
        with the MASKED case.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        return numpy.arange(self.__middle_begin, self.__middle_end)

    @property
    def head_length(self):
        """
        The length, in MFCC coefficients,
        of the HEAD of the audio file.

        :rtype: int
        """
        return self.__middle_begin

    @property
    def tail_length(self):
        """
        The length, in MFCC coefficients,
        of the TAIL of the audio file.

        :rtype: int
        """
        return self.all_length - self.__middle_end

    @property
    def tail_begin(self):
        """
        The index, in MFCC coefficients,
        where the TAIL of the audio file starts.

        :rtype: int
        """
        return self.__middle_end

    @property
    def audio_length(self):
        """
        The length, in seconds, of the audio file.

        When the object is built from audio data,
        this value is the length reported by the audio file,
        hence it might differ from
        ``self.all_length * mfcc_window_shift``.
        When the object is built from an explicit MFCC matrix,
        it is exactly that product.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        return self.__audio_length

    @audio_length.setter
    def audio_length(self, audio_length):
        self.__audio_length = audio_length

    @property
    def is_reversed(self):
        """
        Return ``True`` if currently reversed.

        :rtype: bool
        """
        return self.__is_reversed

    @is_reversed.setter
    def is_reversed(self, is_reversed):
        self.__is_reversed = is_reversed

    @property
    def masked_mfcc(self):
        """
        Return the MFCC speech frames
        in the FULL wave.

        VAD is run first, if it has not been run yet.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        self._ensure_mfcc_mask()
        return self.__mfcc[:, self.__mfcc_mask]

    @property
    def masked_length(self):
        """
        Return the number of MFCC speech frames
        in the FULL wave.

        VAD is run first, if it has not been run yet.

        :rtype: int
        """
        self._ensure_mfcc_mask()
        return len(self.__mfcc_mask_map)

    @property
    def masked_map(self):
        """
        Return the map
        from the MFCC speech frame indices
        to the MFCC FULL frame indices.

        VAD is run first, if it has not been run yet.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        self._ensure_mfcc_mask()
        return self.__mfcc_mask_map

    @property
    def masked_middle_mfcc(self):
        """
        Return the MFCC speech frames
        in the MIDDLE portion of the wave.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        begin, end = self._masked_middle_begin_end()
        return (self.masked_mfcc)[:, begin:end]

    @property
    def masked_middle_length(self):
        """
        Return the number of MFCC speech frames
        in the MIDDLE portion of the wave.

        :rtype: int
        """
        begin, end = self._masked_middle_begin_end()
        return end - begin

    @property
    def masked_middle_map(self):
        """
        Return the map
        from the MFCC speech frame indices
        in the MIDDLE portion of the wave
        to the MFCC FULL frame indices.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        begin, end = self._masked_middle_begin_end()
        return self.__mfcc_mask_map[begin:end]

    def _masked_middle_begin_end(self):
        """
        Return the begin and end indices w.r.t. ``self.__mfcc_mask_map``,
        corresponding to indices in the MIDDLE portion of the wave,
        that is, the speech frames whose FULL index ``i`` satisfies
        ``self.__middle_begin <= i < self.__middle_end``.

        The returned pair follows the NumPy slicing convention:
        ``begin`` is included, ``end`` is excluded.

        :rtype: (int, int)
        """
        self._ensure_mfcc_mask()
        begin = numpy.searchsorted(self.__mfcc_mask_map, self.__middle_begin, side="left")
        # middle_end is excluded from MIDDLE, hence a speech frame
        # located exactly at middle_end belongs to TAIL:
        # side="left" stops before it, side="right" would include it
        end = numpy.searchsorted(self.__mfcc_mask_map, self.__middle_end, side="left")
        return (begin, end)

    def intervals(self, speech=True, time=True):
        """
        Return a list of intervals::

        [(b_1, e_1), (b_2, e_2), ..., (b_k, e_k)]

        where ``b_i`` is the time when the ``i``-th interval begins,
        and ``e_i`` is the time when it ends.

        If ``time`` is ``False``, ``b_i`` and ``e_i`` are
        the indices of the first and of the last frame
        of the ``i``-th interval, both belonging to the interval.

        :param bool speech: if ``True``, return speech intervals,
                            otherwise return nonspeech intervals
        :param bool time: if ``True``, return values in seconds (:class:`~aeneas.timevalue.TimeValue`),
                          otherwise in indices (int)
        :rtype: list of pairs (see above)
        """
        self._ensure_mfcc_mask()
        if speech:
            self.log(u"Converting speech runs to intervals")
            intervals = self.__speech_intervals
        else:
            self.log(u"Converting nonspeech runs to intervals")
            intervals = self.__nonspeech_intervals
        if time:
            mws = self.rconf.mws
            # the last frame belongs to the interval,
            # hence the interval ends where the next frame begins
            return [(i[0] * mws, (i[1] + 1) * mws) for i in intervals]
        return intervals

    def inside_nonspeech(self, index):
        """
        If ``index`` is contained in a nonspeech interval,
        return a pair ``(interval_begin, interval_end)``
        such that ``interval_begin <= index < interval_end``,
        i.e., ``interval_end`` is assumed not to be included.

        Otherwise, return ``None``.

        :param int index: the index of the frame, w.r.t. the FULL wave
        :rtype: ``None`` or tuple
        """
        self._ensure_mfcc_mask()
        if (index < 0) or (index >= self.all_length) or (self.__mfcc_mask[index]):
            return None
        interval = self._binary_search_intervals(self.__nonspeech_intervals, index)
        if interval is None:
            # defensive: not expected, since index is a nonspeech frame
            return None
        # stored intervals include their last frame,
        # while the pair returned here excludes its end
        return (interval[0], interval[1] + 1)

    @classmethod
    def _binary_search_intervals(cls, intervals, index):
        """
        Binary search for the interval containing index,
        assuming there is such an interval.
        This function should never return ``None``.

        :param list intervals: sorted, disjoint pairs ``(first, last)``,
                               where both ``first`` and ``last``
                               belong to the interval
        :param int index: the index to be searched for
        :rtype: tuple
        """
        start = 0
        end = len(intervals) - 1
        while start <= end:
            middle_index = start + ((end - start) // 2)
            middle = intervals[middle_index]
            # the last index belongs to the interval, hence "<=":
            # a strict "<" would miss the last frame of each interval
            # and any interval consisting of a single frame
            if (middle[0] <= index) and (index <= middle[1]):
                return middle
            elif middle[0] > index:
                end = middle_index - 1
            else:
                start = middle_index + 1
        return None

    @property
    def middle_begin(self):
        """
        Return the index where MIDDLE starts.

        :rtype: int
        """
        return self.__middle_begin

    @middle_begin.setter
    def middle_begin(self, index):
        """
        Set the index where MIDDLE starts.

        :param int index: the new index for MIDDLE begin
        :raises: ValueError: if ``index`` is negative or greater than ``all_length``
        """
        if (index < 0) or (index > self.all_length):
            raise ValueError(u"The given index is not valid")
        self.__middle_begin = index

    @property
    def middle_begin_seconds(self):
        """
        Return the time instant, in seconds, where MIDDLE starts.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        return TimeValue(self.__middle_begin) * self.rconf.mws

    @property
    def middle_end(self):
        """
        Return the index (+1) where MIDDLE ends.

        :rtype: int
        """
        return self.__middle_end

    @middle_end.setter
    def middle_end(self, index):
        """
        Set the index (+1) where MIDDLE ends.

        :param int index: the new index for MIDDLE end
        :raises: ValueError: if ``index`` is negative or greater than ``all_length``
        """
        if (index < 0) or (index > self.all_length):
            raise ValueError(u"The given index is not valid")
        self.__middle_end = index

    @property
    def middle_end_seconds(self):
        """
        Return the time instant, in seconds, where MIDDLE ends.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        return TimeValue(self.__middle_end) * self.rconf.mws

    def _ensure_mfcc_mask(self):
        """
        Ensure that ``run_vad()`` has already been called,
        and hence ``self.__mfcc_mask`` has a meaningful value.
        """
        if self.__mfcc_mask is None:
            self.log(u"VAD was not run: running it now")
            self.run_vad()

    def _compute_mfcc_c_extension(self):
        """
        Compute MFCCs using the Python C extension cmfcc.

        On success the matrix is stored internally and
        ``(True, None)`` is returned;
        on any error the exception is logged and
        ``(False, None)`` is returned.

        :rtype: (bool, None)
        """
        self.log(u"Computing MFCCs using C extension...")
        try:
            self.log(u"Importing cmfcc...")
            import aeneas.cmfcc.cmfcc
            self.log(u"Importing cmfcc... done")
            # only the first element returned by the extension is used;
            # it is transposed before being stored
            self.__mfcc = (aeneas.cmfcc.cmfcc.compute_from_data(
                self.audio_file.audio_samples,
                self.audio_file.audio_sample_rate,
                self.rconf[RuntimeConfiguration.MFCC_FILTERS],
                self.rconf[RuntimeConfiguration.MFCC_SIZE],
                self.rconf[RuntimeConfiguration.MFCC_FFT_ORDER],
                self.rconf[RuntimeConfiguration.MFCC_LOWER_FREQUENCY],
                self.rconf[RuntimeConfiguration.MFCC_UPPER_FREQUENCY],
                self.rconf[RuntimeConfiguration.MFCC_EMPHASIS_FACTOR],
                self.rconf[RuntimeConfiguration.MFCC_WINDOW_LENGTH],
                self.rconf[RuntimeConfiguration.MFCC_WINDOW_SHIFT]
            )[0]).transpose()
            self.log(u"Computing MFCCs using C extension... done")
            return (True, None)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while running cmfcc", exc, False, None)
        return (False, None)

    def _compute_mfcc_pure_python(self):
        """
        Compute MFCCs using the pure Python code.

        On success the matrix is stored internally and
        ``(True, None)`` is returned;
        on any error the exception is logged and
        ``(False, None)`` is returned.

        :rtype: (bool, None)
        """
        self.log(u"Computing MFCCs using pure Python code...")
        try:
            self.__mfcc = MFCC(
                rconf=self.rconf,
                logger=self.logger
            ).compute_from_data(
                self.audio_file.audio_samples,
                self.audio_file.audio_sample_rate
            ).transpose()
            self.log(u"Computing MFCCs using pure Python code... done")
            return (True, None)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while running pure Python code", exc, False, None)
        return (False, None)

    def reverse(self):
        """
        Reverse the audio file.

        The reversing is done efficiently using NumPy views inplace
        instead of swapping values.

        Only speech and nonspeech intervals are actually recomputed
        as Python lists.
        """
        self.log(u"Reversing...")
        all_length = self.all_length
        self.__mfcc = self.__mfcc[:, ::-1]
        # MIDDLE is half-open: [begin, end[ becomes [all_length - end, all_length - begin[
        tmp = self.__middle_end
        self.__middle_end = all_length - self.__middle_begin
        self.__middle_begin = all_length - tmp
        if self.__mfcc_mask is not None:
            self.__mfcc_mask = self.__mfcc_mask[::-1]
            # equivalent to
            # self.__mfcc_mask_map = ((all_length - 1) - self.__mfcc_mask_map)[::-1]
            # but done in place using NumPy view
            self.__mfcc_mask_map *= -1
            self.__mfcc_mask_map += all_length - 1
            self.__mfcc_mask_map = self.__mfcc_mask_map[::-1]
            # frame j moves to position (all_length - 1) - j;
            # since both ends of a stored interval belong to it,
            # (first, last) becomes (last_index - last, last_index - first)
            last_index = all_length - 1
            self.__speech_intervals = [(last_index - i[1], last_index - i[0]) for i in self.__speech_intervals[::-1]]
            self.__nonspeech_intervals = [(last_index - i[1], last_index - i[0]) for i in self.__nonspeech_intervals[::-1]]
        self.is_reversed = not self.is_reversed
        self.log(u"Reversing...done")

    def run_vad(self):
        """
        Determine which frames contain speech and nonspeech,
        and store the resulting boolean mask internally.

        The speech and nonspeech intervals are stored as well,
        as lists of pairs ``(first, last)`` of frame indices,
        where both ``first`` and ``last`` belong to the interval.
        """
        def _compute_runs(array):
            """
            Compute runs as a list of arrays,
            each containing the indices of a contiguous run.

            :param array: the data array
            :type  array: :class:`numpy.ndarray` (1D)
            :rtype: list of :class:`numpy.ndarray` (1D)
            """
            if len(array) < 1:
                return []
            # split wherever two consecutive indices are not adjacent
            return numpy.split(array, numpy.where(numpy.diff(array) != 1)[0] + 1)
        self.log(u"Creating VAD object")
        vad = VAD(rconf=self.rconf, logger=self.logger)
        self.log(u"Running VAD...")
        # VAD is run on the first row of the MFCC matrix only
        self.__mfcc_mask = vad.run_vad(self.__mfcc[0])
        self.__mfcc_mask_map = (numpy.where(self.__mfcc_mask))[0]
        self.log(u"Running VAD... done")
        self.log(u"Storing speech and nonspeech intervals...")
        # where( == True) already computed, reusing
        runs = _compute_runs(self.__mfcc_mask_map)
        self.__speech_intervals = [(r[0], r[-1]) for r in runs]
        # where( == False) not already computed, computing now
        runs = _compute_runs((numpy.where(~self.__mfcc_mask))[0])
        self.__nonspeech_intervals = [(r[0], r[-1]) for r in runs]
        self.log(u"Storing speech and nonspeech intervals... done")

    def set_head_middle_tail(self, head_length=None, middle_length=None, tail_length=None):
        """
        Set the HEAD, MIDDLE, TAIL explicitly.

        If a parameter is ``None``, it will be ignored.
        If both ``middle_length`` and ``tail_length`` are specified,
        only ``middle_length`` will be applied.

        :param head_length: the length of HEAD, in seconds
        :type  head_length: :class:`~aeneas.timevalue.TimeValue`
        :param middle_length: the length of MIDDLE, in seconds
        :type  middle_length: :class:`~aeneas.timevalue.TimeValue`
        :param tail_length: the length of TAIL, in seconds
        :type  tail_length: :class:`~aeneas.timevalue.TimeValue`
        :raises: TypeError: if one of the arguments is not ``None``
                            or :class:`~aeneas.timevalue.TimeValue`
        :raises: ValueError: if a resulting index is negative
                             or greater than ``all_length``
        """
        for variable, name in [
            (head_length, "head_length"),
            (middle_length, "middle_length"),
            (tail_length, "tail_length")
        ]:
            if (variable is not None) and (not isinstance(variable, TimeValue)):
                raise TypeError(u"%s is not None or TimeValue" % name)
        self.log(u"Setting head middle tail...")
        mws = self.rconf.mws
        self.log([u"Before: 0 %d %d %d", self.middle_begin, self.middle_end, self.all_length])
        if head_length is not None:
            self.middle_begin = int(head_length / mws)
        if middle_length is not None:
            # MIDDLE is measured from the (possibly just updated) MIDDLE begin
            self.middle_end = self.middle_begin + int(middle_length / mws)
        elif tail_length is not None:
            self.middle_end = self.all_length - int(tail_length / mws)
        self.log([u"After:  0 %d %d %d", self.middle_begin, self.middle_end, self.all_length])
        self.log(u"Setting head middle tail... done")
예제 #37
0
 def test_load_audio_file(self):
     """
     Build an ``AudioFileMFCC`` from an ``AudioFile`` whose samples
     have been read from file, and check that the MFCCs are available
     and that the audio length is the expected one.
     """
     wave_path = gf.absolute_path(self.AUDIO_FILE_WAVE, __file__)
     source = AudioFile(wave_path)
     source.read_samples_from_file()
     mfcc_file = AudioFileMFCC(audio_file=source)
     self.assertIsNotNone(mfcc_file.all_mfcc)
     # the actual length is 53.266, compared up to one decimal place
     expected_length = TimeValue("53.3")
     self.assertAlmostEqual(mfcc_file.audio_length, expected_length, places=1)
예제 #38
0
    def _synthesize_multiple_python(self,
                                    text_file,
                                    output_file_path,
                                    quit_after=None,
                                    backwards=False):
        """
        Synthesize multiple text fragments, via Python call.

        Return a tuple (anchors, total_time, num_chars).

        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # TODO in the Speect Python API I was not able to find a way
        #      to generate the wave incrementally
        #      so I essentially copy the subprocess call mechanism:
        #      generating wave data for each fragment,
        #      and concatenating them together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # get sample rate and encoding
            du_nu, sample_rate, encoding, da_nu = self._synthesize_single_helper(
                text=u"Dummy text to get sample_rate",
                voice_code=self.DEFAULT_LANGUAGE)

            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = encoding
            output_file.audio_channels = 1
            output_file.audio_sample_rate = sample_rate

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num = 0
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for fragment in fragments:
                # language to voice code
                #
                # NOTE since voice_code is actually ignored
                # in _synthesize_single_helper(),
                # the value of voice_code is irrelevant
                #
                # however, in general you need to apply
                # the _language_to_voice_code() function that maps
                # the text language to a voice code
                #
                # here we apply the _language_to_voice_code() defined in super()
                # that sets voice_code = fragment.language
                #
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code)
                # store for later output
                anchors.append(
                    [current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # append new data
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    # if backwards, we append the data reversed
                    output_file.add_samples(data, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # increment fragment counter
                num += 1
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([
                        u"Quitting after reached duration %.3f", current_time
                    ])
                    break

            # if backwards, we need to reverse the audio samples again
            if backwards:
                output_file.reverse()

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(
                u"An unexpected error occurred while calling TTS engine via Python",
                exc, False, None)
            return (False, None)

        # return output
        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
예제 #39
0
    def _cut_head_tail(self, audio_file_path):
        """
        Set the audio file head or tail,
        suitably cutting the audio file on disk,
        and setting the corresponding parameters in the task configuration.

        Return a success bool flag
        """
        self._log("Setting head and/or tail")
        try:
            configuration = self.task.configuration
            head_length = configuration.is_audio_file_head_length
            process_length = configuration.is_audio_file_process_length
            detect_head_min = configuration.is_audio_file_detect_head_min
            detect_head_max = configuration.is_audio_file_detect_head_max
            detect_tail_min = configuration.is_audio_file_detect_tail_min
            detect_tail_max = configuration.is_audio_file_detect_tail_max

            # explicit head or process?
            explicit = (head_length is not None) or (process_length
                                                     is not None)

            # at least one detect parameter?
            detect = ((detect_head_min is not None)
                      or (detect_head_max is not None)
                      or (detect_tail_min is not None)
                      or (detect_tail_max is not None))

            if explicit or detect:
                # we need to load the audio data
                audio_file = AudioFile(audio_file_path, logger=self.logger)
                audio_file.load_data()

                if explicit:
                    self._log("Explicit head or process")
                else:
                    self._log(
                        "No explicit head or process => detecting head/tail")

                    head = 0.0
                    if (detect_head_min is not None) or (detect_head_max
                                                         is not None):
                        self._log("Detecting head...")
                        detect_head_min = gf.safe_float(
                            detect_head_min, gc.SD_MIN_HEAD_LENGTH)
                        detect_head_max = gf.safe_float(
                            detect_head_max, gc.SD_MAX_HEAD_LENGTH)
                        self._log(["detect_head_min is %.3f", detect_head_min])
                        self._log(["detect_head_max is %.3f", detect_head_max])
                        sd = SD(audio_file,
                                self.task.text_file,
                                logger=self.logger)
                        head = sd.detect_head(detect_head_min, detect_head_max)
                        self._log(["Detected head: %.3f", head])

                    tail = 0.0
                    if (detect_tail_min is not None) or (detect_tail_max
                                                         is not None):
                        self._log("Detecting tail...")
                        detect_tail_max = gf.safe_float(
                            detect_tail_max, gc.SD_MAX_TAIL_LENGTH)
                        detect_tail_min = gf.safe_float(
                            detect_tail_min, gc.SD_MIN_TAIL_LENGTH)
                        self._log(["detect_tail_min is %.3f", detect_tail_min])
                        self._log(["detect_tail_max is %.3f", detect_tail_max])
                        sd = SD(audio_file,
                                self.task.text_file,
                                logger=self.logger)
                        tail = sd.detect_tail(detect_tail_min, detect_tail_max)
                        self._log(["Detected tail: %.3f", tail])

                    # sanity check
                    head_length = max(0, head)
                    process_length = max(0,
                                         audio_file.audio_length - tail - head)

                    # we need to set these values
                    # in the config object for later use
                    self.task.configuration.is_audio_file_head_length = head_length
                    self.task.configuration.is_audio_file_process_length = process_length
                    self._log(["Set head_length:    %.3f", head_length])
                    self._log(["Set process_length: %.3f", process_length])

                if head_length is not None:
                    # in case we are reading from config object
                    head_length = float(head_length)
                if process_length is not None:
                    # in case we are reading from config object
                    process_length = float(process_length)
                # note that str() is necessary, as one might be None
                self._log(
                    ["is_audio_file_head_length is %s",
                     str(head_length)])
                self._log([
                    "is_audio_file_process_length is %s",
                    str(process_length)
                ])
                self._log("Trimming audio data...")
                audio_file.trim(head_length, process_length)
                self._log("Trimming audio data... done")
                self._log("Writing audio file...")
                audio_file.write(audio_file_path)
                self._log("Writing audio file... done")
                audio_file.clear_data()
            else:
                # nothing to do
                self._log("No explicit head/process or detect head/tail")

            self._log("Setting head and/or tail: succeeded")
            return True
        except Exception as e:
            self._log("Setting head and/or tail: failed")
            self._log(["Message: %s", str(e)])
            return False
예제 #40
0
파일: task.py 프로젝트: ptrwtts/aeneas
class Task(Loggable):
    """
    A structure representing a task, that is,
    an audio file and an ordered set of text fragments
    to be synchronized.

    :param string config_string: the task configuration string
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    :raises: TypeError: if ``config_string`` is not ``None`` and
                        it is not a Unicode string
    """

    TAG = u"Task"

    def __init__(self, config_string=None, rconf=None, logger=None):
        super(Task, self).__init__(rconf=rconf, logger=logger)
        self.identifier = gf.uuid_string()
        self.configuration = None
        # NOTE: the ``*_path_absolute`` attributes are properties;
        #       assigning them runs the corresponding setter below
        self.audio_file_path = None # relative to input container root
        self.audio_file_path_absolute = None # concrete path, file will be read from this!
        self.audio_file = None
        self.text_file_path = None # relative to input container root
        self.text_file_path_absolute = None # concrete path, file will be read from this!
        self.text_file = None
        self.sync_map_file_path = None # relative to output container root
        self.sync_map_file_path_absolute = None # concrete path, file will be written to this!
        self.sync_map = None
        if config_string is not None:
            self.configuration = TaskConfiguration(config_string)

    def __unicode__(self):
        # a task created without a configuration string has
        # ``self.configuration`` set to None: render it as "None"
        # instead of failing with AttributeError
        configuration = self.configuration
        configuration_text = None
        if configuration is not None:
            configuration_text = configuration.__unicode__()
        msg = [
            u"%s: '%s'" % (gc.RPN_TASK_IDENTIFIER, self.identifier),
            u"Configuration:\n%s" % configuration_text,
            u"Audio file path: %s" % self.audio_file_path,
            u"Audio file path (absolute): %s" % self.audio_file_path_absolute,
            u"Text file path: %s" % self.text_file_path,
            u"Text file path (absolute): %s" % self.text_file_path_absolute,
            u"Sync map file path: %s" % self.sync_map_file_path,
            u"Sync map file path (absolute): %s" % self.sync_map_file_path_absolute
        ]
        return u"\n".join(msg)

    def __str__(self):
        return gf.safe_str(self.__unicode__())

    @property
    def identifier(self):
        """
        The identifier of the task.

        :rtype: string
        """
        return self.__identifier
    @identifier.setter
    def identifier(self, value):
        self.__identifier = value

    @property
    def audio_file_path_absolute(self):
        """
        The absolute path of the audio file.

        Setting this property also (re)creates ``self.audio_file``
        via ``_populate_audio_file()``.

        :rtype: string
        """
        return self.__audio_file_path_absolute
    @audio_file_path_absolute.setter
    def audio_file_path_absolute(self, audio_file_path_absolute):
        self.__audio_file_path_absolute = audio_file_path_absolute
        self._populate_audio_file()

    @property
    def text_file_path_absolute(self):
        """
        The absolute path of the text file.

        Setting this property also (re)creates ``self.text_file``
        via ``_populate_text_file()``.

        :rtype: string
        """
        return self.__text_file_path_absolute
    @text_file_path_absolute.setter
    def text_file_path_absolute(self, text_file_path_absolute):
        self.__text_file_path_absolute = text_file_path_absolute
        self._populate_text_file()

    @property
    def sync_map_file_path_absolute(self):
        """
        The absolute path of the sync map file.

        :rtype: string
        """
        return self.__sync_map_file_path_absolute
    @sync_map_file_path_absolute.setter
    def sync_map_file_path_absolute(self, sync_map_file_path_absolute):
        self.__sync_map_file_path_absolute = sync_map_file_path_absolute

    def output_sync_map_file(self, container_root_path=None):
        """
        Output the sync map file for this task.

        If ``container_root_path`` is specified,
        the output sync map file will be created
        at the path obtained by joining
        the ``container_root_path`` and the relative path
        of the sync map inside the container.

        Otherwise, the sync map file will be created at the path
        ``self.sync_map_file_path_absolute``.

        Return the path of the sync map file created.

        Missing prerequisites (no sync map object, no relative path
        while a container root is given, no output path at all)
        are reported through ``self.log_exc(..., TypeError)``.
        NOTE(review): presumably ``log_exc`` raises the given
        exception type -- confirm against ``Loggable``.

        :param string container_root_path: the path to the root directory
                                           for the output container
        :rtype: string
        """
        if self.sync_map is None:
            self.log_exc(u"The sync_map object has not been set", None, True, TypeError)

        if (container_root_path is not None) and (self.sync_map_file_path is None):
            self.log_exc(u"The (internal) path of the sync map has not been set", None, True, TypeError)

        self.log([u"container_root_path is %s", container_root_path])
        self.log([u"self.sync_map_file_path is %s", self.sync_map_file_path])
        self.log([u"self.sync_map_file_path_absolute is %s", self.sync_map_file_path_absolute])

        if (container_root_path is not None) and (self.sync_map_file_path is not None):
            path = os.path.join(container_root_path, self.sync_map_file_path)
        elif self.sync_map_file_path_absolute:
            path = self.sync_map_file_path_absolute
        else:
            # no usable output location: report it explicitly instead of
            # failing below with an UnboundLocalError on ``path``
            self.log_exc(u"The absolute path of the sync map has not been set", None, True, TypeError)
            # only reached if log_exc did not raise
            return None
        gf.ensure_parent_directory(path)
        self.log([u"Output sync map to %s", path])

        sync_map_format = self.configuration["o_format"]
        audio_ref = self.configuration["o_smil_audio_ref"]
        page_ref = self.configuration["o_smil_page_ref"]

        self.log([u"sync_map_format is %s", sync_map_format])
        self.log([u"page_ref is %s", page_ref])
        self.log([u"audio_ref is %s", audio_ref])

        self.log(u"Calling sync_map.write...")
        afpa = self.audio_file_path_absolute
        if afpa is not None:
            afpa = os.path.abspath(afpa)
        parameters = {
            "audio_file_path_absolute": afpa,
            gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF : page_ref,
            gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF : audio_ref
        }
        self.sync_map.write(sync_map_format, path, parameters)
        self.log(u"Calling sync_map.write... done")
        return path

    def _populate_audio_file(self):
        """
        Create the ``self.audio_file`` object by reading
        the audio file at ``self.audio_file_path_absolute``.

        Nothing is created if the absolute path is ``None``.
        """
        self.log(u"Populate audio file...")
        if self.audio_file_path_absolute is not None:
            self.log([u"audio_file_path_absolute is '%s'", self.audio_file_path_absolute])
            # forward the runtime configuration of this task, as the other
            # AudioFile(...) call sites do, so that the properties are read
            # with the same settings
            # NOTE(review): assumes Loggable stores rconf as self.rconf -- confirm
            self.audio_file = AudioFile(
                file_path=self.audio_file_path_absolute,
                rconf=self.rconf,
                logger=self.logger
            )
            self.audio_file.read_properties()
        else:
            self.log(u"audio_file_path_absolute is None")
        self.log(u"Populate audio file... done")

    def _populate_text_file(self):
        """
        Create the ``self.text_file`` object by reading
        the text file at ``self.text_file_path_absolute``.

        Nothing is created unless both the absolute path and
        the ``language`` configuration value are set.
        """
        self.log(u"Populate text file...")
        if (
                (self.text_file_path_absolute is not None) and
                (self.configuration["language"] is not None)
            ):
            # the following values might be None
            parameters = {
                gc.PPN_TASK_IS_TEXT_FILE_IGNORE_REGEX : self.configuration["i_t_ignore_regex"],
                gc.PPN_TASK_IS_TEXT_FILE_TRANSLITERATE_MAP : self.configuration["i_t_transliterate_map"],
                gc.PPN_TASK_IS_TEXT_MPLAIN_WORD_SEPARATOR : self.configuration["i_t_mplain_word_separator"],
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX : self.configuration["i_t_munparsed_l1_id_regex"],
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX : self.configuration["i_t_munparsed_l2_id_regex"],
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX : self.configuration["i_t_munparsed_l3_id_regex"],
                gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX : self.configuration["i_t_unparsed_class_regex"],
                gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX : self.configuration["i_t_unparsed_id_regex"],
                gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT : self.configuration["i_t_unparsed_id_sort"],
                gc.PPN_TASK_OS_FILE_ID_REGEX : self.configuration["o_id_regex"]
            }
            self.text_file = TextFile(
                file_path=self.text_file_path_absolute,
                file_format=self.configuration["i_t_format"],
                parameters=parameters,
                logger=self.logger
            )
            self.text_file.set_language(self.configuration["language"])
        else:
            self.log(u"text_file_path_absolute and/or language is None")
        self.log(u"Populate text file... done")
예제 #41
0
    def _synthesize_multiple_subprocess(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize multiple fragments via ``subprocess``.

        Each fragment of ``text_file`` is synthesized to a temporary
        wave file, its samples are appended to a single output audio file,
        and the output is finally written to ``output_file_path``.

        :param text_file: the object whose ``fragments`` are synthesized
        :param output_file_path: the path of the audio file to be written
        :param quit_after: if not ``None``, stop synthesizing as soon as
                           the accumulated duration exceeds this value
        :param bool backwards: if ``True``, process the fragments
                               in reverse order
        :rtype: tuple (result, (anchors, current_time, num_chars)),
                or ``(False, None)`` if an exception occurred
        """
        def synthesize_and_clean(text, voice_code):
            """
            Synthesize a single fragment via subprocess,
            and immediately remove the temporary file.

            The temporary file is removed even if the synthesis raises.

            :rtype: tuple (duration, sample_rate, encoding, samples)
            """
            self.log(u"Synthesizing text...")
            handler, tmp_destination = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
            try:
                _result, data = self._synthesize_single_subprocess(
                    text=(text + u" "),
                    voice_code=voice_code,
                    output_file_path=tmp_destination
                )
            finally:
                # always clean up, otherwise a failing synthesis
                # would leave the temporary file behind
                self.log([u"Removing temporary file '%s'", tmp_destination])
                gf.delete_file(handler, tmp_destination)
            self.log(u"Synthesizing text... done")
            return data

        self.log(u"Calling TTS engine via subprocess...")

        try:
            # get sample rate and encoding by synthesizing a dummy text;
            # the duration and the samples of this call are not used
            _duration, sample_rate, encoding, _samples = synthesize_and_clean(
                text=u"Dummy text to get sample_rate",
                voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE)
            )

            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = encoding
            output_file.audio_channels = 1
            output_file.audio_sample_rate = sample_rate

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for num, fragment in enumerate(fragments):
                # language to voice code
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, _sr, _enc, samples = synthesize_and_clean(
                    text=fragment.filtered_text,
                    voice_code=voice_code
                )
                # store for later output
                anchors.append([current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # concatenate new samples
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    output_file.add_samples(samples, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([u"Quitting after reached duration %.3f", current_time])
                    break

            # minimize memory
            self.log(u"Minimizing memory...")
            output_file.minimize_memory()
            self.log(u"Minimizing memory... done")

            # if backwards, we need to reverse the audio samples again
            if backwards:
                self.log(u"Reversing audio samples...")
                output_file.reverse()
                self.log(u"Reversing audio samples... done")

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while calling TTS engine via subprocess", exc, False, None)
            return (False, None)

        # return output
        if backwards:
            self.log_warn(u"Please note that anchor time values do not make sense since backwards=True")
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via subprocess... done")
        return (True, (anchors, current_time, num_chars))
예제 #42
0
    def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize the fragments of ``text_file`` through the Python call
        of the TTS engine, and write the concatenated audio
        to ``output_file_path``.

        On success, return ``(True, (anchors, total_time, num_chars))``;
        if any exception is raised, log it and return ``(False, None)``.

        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # TODO no way was found in the Speect Python API
        #      to generate the wave incrementally,
        #      hence this mirrors the subprocess mechanism:
        #      wave data is generated for each fragment,
        #      and the pieces are concatenated together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # probe the engine with a dummy text:
            # only sample rate and encoding are kept
            probe = self._synthesize_single_helper(
                text=u"Dummy text to get sample_rate",
                voice_code=self.DEFAULT_LANGUAGE
            )
            _probe_duration, sample_rate, encoding, _probe_data = probe

            # set up the (mono) output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = encoding
            output_file.audio_channels = 1
            output_file.audio_sample_rate = sample_rate

            # accumulators for the output
            anchors = []
            current_time = TimeValue("0.000")
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for index, fragment in enumerate(fragments):
                #
                # NOTE _synthesize_single_helper() actually ignores
                # voice_code, so its value is irrelevant here;
                # in general, however, the text language must be mapped
                # to a voice code with _language_to_voice_code():
                # the one defined in super() is applied,
                # which sets voice_code = fragment.language
                #
                voice_code = self._language_to_voice_code(fragment.language)
                self.log([u"Synthesizing fragment %d", index])
                synthesized = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code
                )
                duration, _rate, _encoding, data = synthesized
                # the anchor records the time at which the fragment begins
                anchors.append([current_time, fragment.identifier, fragment.text])
                num_chars += fragment.characters
                self.log([u"Fragment %d starts at: %.3f", index, current_time])
                if not (duration > 0):
                    self.log([u"Fragment %d has zero duration", index])
                else:
                    self.log([u"Fragment %d duration: %.3f", index, duration])
                    current_time += duration
                    # when going backwards, the data is appended reversed
                    output_file.add_samples(data, reverse=backwards)
                # stop as soon as enough audio has been produced
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([u"Quitting after reached duration %.3f", current_time])
                    break

            # when going backwards, reverse the whole audio once more
            if backwards:
                output_file.reverse()

            # flush to disk
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while calling TTS engine via Python", exc, False, None)
            return (False, None)

        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
예제 #43
0
 def test_preallocate(self):
     """
     Reading ``audio_samples`` of a fresh ``AudioFile`` must raise
     ``AudioFileNotInitializedError``; after ``preallocate_memory(100)``
     the property is readable and its length is expected to be 0.
     """
     audio = AudioFile()
     # before any allocation, accessing the samples is an error
     self.assertRaises(AudioFileNotInitializedError, getattr, audio, "audio_samples")
     audio.preallocate_memory(100)
     sample_count = len(audio.audio_samples)
     self.assertEqual(sample_count, 0)
예제 #44
0
    def _synthesize_multiple_python(self,
                                    text_file,
                                    output_file_path,
                                    quit_after=None,
                                    backwards=False):
        """
        Synthesize the fragments of ``text_file`` through the Python call
        of the TTS engine, and write the concatenated audio
        to ``output_file_path``.

        The output is set up as a mono ``pcm16`` file
        at ``self.SAMPLE_RATE``.

        On success, return ``(True, (anchors, total_time, num_chars))``.

        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # wave data is generated for each fragment,
        # and the pieces are concatenated together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # set up the output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = "pcm16"
            output_file.audio_channels = 1
            output_file.audio_sample_rate = self.SAMPLE_RATE

            # accumulators for the output
            anchors = []
            current_time = TimeValue("0.000")
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for index, fragment in enumerate(fragments):
                # map the fragment language to a voice code
                voice_code = self._language_to_voice_code(fragment.language)
                self.log([u"Synthesizing fragment %d", index])
                synthesized = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code)
                duration, _rate, _encoding, data = synthesized
                # the anchor records the time at which the fragment begins
                anchors.append(
                    [current_time, fragment.identifier, fragment.text])
                num_chars += fragment.characters
                self.log([u"Fragment %d starts at: %.3f", index, current_time])
                if not (duration > 0):
                    self.log([u"Fragment %d has zero duration", index])
                else:
                    self.log([u"Fragment %d duration: %.3f", index, duration])
                    current_time += duration
                    # when going backwards, the data is appended reversed
                    output_file.add_samples(data, reverse=backwards)
                # stop as soon as enough audio has been produced
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([
                        u"Quitting after reached duration %.3f", current_time
                    ])
                    break

            # when going backwards, reverse the whole audio once more
            if backwards:
                output_file.reverse()

            # flush to disk
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            # NOTE(review): type(exc) is passed as the last argument of
            # log_exc; if log_exc re-raises in that case, the return
            # below is never reached -- confirm the intended behavior
            self.log_exc(
                u"Unexpected exception while calling TTS engine via Python",
                exc, None, type(exc))
            return (False, None)

        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
예제 #45
0
class Task(object):
    """
    A structure representing a task, that is,
    an audio file and a list of text fragments
    to be synchronized.

    :param config_string: the task configuration string
    :type  config_string: string
    """

    TAG = "Task"

    def __init__(self, config_string=None):
        # task properties
        # NOTE: the ``*_path_absolute`` attributes are properties;
        #       assigning them runs the corresponding setter below
        self.identifier = str(uuid.uuid4()).lower()
        self.configuration = None
        self.audio_file_path = None # relative to input container root
        self.audio_file_path_absolute = None # concrete path, file will be read from this!
        self.audio_file = None
        self.text_file_path = None # relative to input container root
        self.text_file_path_absolute = None # concrete path, file will be read from this!
        self.text_file = None
        self.sync_map_file_path = None # relative to output container root
        self.sync_map_file_path_absolute = None # concrete path, file will be written to this!
        self.sync_map = None
        if config_string is not None:
            self.configuration = TaskConfiguration(config_string)

    def __str__(self):
        # one entry per output line; every line, including the last one,
        # is terminated by a newline
        lines = [
            "%s: '%s'" % (gc.RPN_TASK_IDENTIFIER, self.identifier),
            "Configuration:\n%s" % str(self.configuration),
            "Audio file path: %s" % self.audio_file_path,
            "Audio file path (absolute): %s" % self.audio_file_path_absolute,
            "Text file path: %s" % self.text_file_path,
            "Text file path (absolute): %s" % self.text_file_path_absolute,
            "Sync map file path: %s" % self.sync_map_file_path,
            "Sync map file path (absolute): %s" % self.sync_map_file_path_absolute,
        ]
        return "\n".join(lines) + "\n"

    @property
    def identifier(self):
        """
        The identifier of the task.

        :rtype: string
        """
        return self.__identifier
    @identifier.setter
    def identifier(self, value):
        self.__identifier = value

    @property
    def audio_file_path_absolute(self):
        """
        The absolute path of the audio file.

        Setting this property also (re)creates ``self.audio_file``
        via ``_populate_audio_file()``.

        :rtype: string (path)
        """
        return self.__audio_file_path_absolute
    @audio_file_path_absolute.setter
    def audio_file_path_absolute(self, audio_file_path_absolute):
        self.__audio_file_path_absolute = audio_file_path_absolute
        self._populate_audio_file()

    @property
    def text_file_path_absolute(self):
        """
        The absolute path of the text file.

        Setting this property also (re)creates ``self.text_file``
        via ``_populate_text_file()``.

        :rtype: string (path)
        """
        return self.__text_file_path_absolute
    @text_file_path_absolute.setter
    def text_file_path_absolute(self, text_file_path_absolute):
        self.__text_file_path_absolute = text_file_path_absolute
        self._populate_text_file()

    @property
    def sync_map_file_path_absolute(self):
        """
        The absolute path of the sync map file.

        :rtype: string (path)
        """
        return self.__sync_map_file_path_absolute
    @sync_map_file_path_absolute.setter
    def sync_map_file_path_absolute(self, sync_map_file_path_absolute):
        self.__sync_map_file_path_absolute = sync_map_file_path_absolute

    def _populate_audio_file(self):
        """
        Create the ``self.audio_file`` object by reading
        the audio file at ``self.audio_file_path_absolute``.

        Nothing is created if the absolute path is ``None``.
        """
        if self.audio_file_path_absolute is not None:
            self.audio_file = AudioFile(
                file_path=self.audio_file_path_absolute,
                logger=None
            )
            self.audio_file.read_properties()

    def _populate_text_file(self):
        """
        Create the ``self.text_file`` object by reading
        the text file at ``self.text_file_path_absolute``.

        Nothing is created unless both the absolute path and
        the configuration language are set.
        """
        if ((self.text_file_path_absolute is not None) and
                (self.configuration.language is not None)):
            parameters = {
                gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: self.configuration.is_text_unparsed_class_regex,
                gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: self.configuration.is_text_unparsed_id_regex,
                gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: self.configuration.is_text_unparsed_id_sort,
            }
            self.text_file = TextFile(
                file_path=self.text_file_path_absolute,
                file_format=self.configuration.is_text_file_format,
                parameters=parameters,
                logger=None
            )
            self.text_file.set_language(self.configuration.language)

    def output_sync_map_file(self, container_root_path=None):
        """
        Output the sync map file for this task.

        If ``container_root_path`` is specified,
        the output sync map file will be created
        at the path obtained by joining
        the ``container_root_path`` and the relative path
        of the sync map inside the container.

        Otherwise, the sync map file will be created at the path
        ``sync_map_file_path_absolute``.

        Return the path of the sync map file created,
        or ``None`` if an error occurred: no sync map has been set,
        no output location can be determined,
        or writing the sync map reported a failure.

        :param container_root_path: the path to the root directory
                                    for the output container
        :type  container_root_path: string (path)
        :rtype: return the path of the sync map file created
        """
        if self.sync_map is None:
            return None

        if container_root_path is not None:
            if self.sync_map_file_path is None:
                return None
            path = os.path.join(container_root_path, self.sync_map_file_path)
        elif self.sync_map_file_path_absolute:
            path = self.sync_map_file_path_absolute
        else:
            # neither a container root nor an absolute path is available:
            # report the error as documented, instead of failing below
            # with an UnboundLocalError on ``path``
            return None

        sync_map_format = self.configuration.os_file_format
        parameters = {
            gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF: self.configuration.os_file_smil_page_ref,
            gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF: self.configuration.os_file_smil_audio_ref,
        }
        result = self.sync_map.write(sync_map_format, path, parameters)
        if not result:
            return None
        return path
예제 #46
0
class Task(object):
    """
    A structure representing a task, that is,
    an audio file and a list of text fragments
    to be synchronized.

    Assigning ``audio_file_path_absolute`` or ``text_file_path_absolute``
    triggers the creation of the corresponding
    ``audio_file`` / ``text_file`` object.

    :param config_string: the task configuration string
    :type  config_string: string
    """

    TAG = "Task"

    def __init__(self, config_string=None):
        # task properties
        self.identifier = str(uuid.uuid4()).lower()
        self.configuration = None
        self.audio_file_path = None  # relative to input container root
        self.audio_file_path_absolute = None  # concrete path, file will be read from this!
        self.audio_file = None
        self.text_file_path = None  # relative to input container root
        self.text_file_path_absolute = None  # concrete path, file will be read from this!
        self.text_file = None
        self.sync_map_file_path = None  # relative to output container root
        self.sync_map_file_path_absolute = None  # concrete path, file will be written to this!
        self.sync_map = None
        if config_string is not None:
            self.configuration = TaskConfiguration(config_string)

    def __str__(self):
        # build all the lines first and join them once
        parts = [
            "%s: '%s'\n" % (gc.RPN_TASK_IDENTIFIER, self.identifier),
            "Configuration:\n%s\n" % str(self.configuration),
            "Audio file path: %s\n" % self.audio_file_path,
            "Audio file path (absolute): %s\n" % self.audio_file_path_absolute,
            "Text file path: %s\n" % self.text_file_path,
            "Text file path (absolute): %s\n" % self.text_file_path_absolute,
            "Sync map file path: %s\n" % self.sync_map_file_path,
            "Sync map file path (absolute): %s\n" % self.sync_map_file_path_absolute,
        ]
        return "".join(parts)

    @property
    def identifier(self):
        """
        The identifier of the task.

        :rtype: string
        """
        return self.__identifier

    @identifier.setter
    def identifier(self, value):
        self.__identifier = value

    @property
    def audio_file_path_absolute(self):
        """
        The absolute path of the audio file.

        Setting this property also (re)creates ``self.audio_file``
        when the new value is not ``None``.

        :rtype: string (path)
        """
        return self.__audio_file_path_absolute

    @audio_file_path_absolute.setter
    def audio_file_path_absolute(self, audio_file_path_absolute):
        self.__audio_file_path_absolute = audio_file_path_absolute
        self._populate_audio_file()

    @property
    def text_file_path_absolute(self):
        """
        The absolute path of the text file.

        Setting this property also (re)creates ``self.text_file``
        when the new value is not ``None`` and
        a configuration with a language is available.

        :rtype: string (path)
        """
        return self.__text_file_path_absolute

    @text_file_path_absolute.setter
    def text_file_path_absolute(self, text_file_path_absolute):
        self.__text_file_path_absolute = text_file_path_absolute
        self._populate_text_file()

    @property
    def sync_map_file_path_absolute(self):
        """
        The absolute path of the sync map file.

        :rtype: string (path)
        """
        return self.__sync_map_file_path_absolute

    @sync_map_file_path_absolute.setter
    def sync_map_file_path_absolute(self, sync_map_file_path_absolute):
        self.__sync_map_file_path_absolute = sync_map_file_path_absolute

    def _populate_audio_file(self):
        """
        Create the ``self.audio_file`` object by reading
        the audio file at ``self.audio_file_path_absolute``.

        Nothing is done if the path is ``None``.
        """
        if self.audio_file_path_absolute is not None:
            self.audio_file = AudioFile(
                file_path=self.audio_file_path_absolute, logger=None)
            self.audio_file.read_properties()

    def _populate_text_file(self):
        """
        Create the ``self.text_file`` object by reading
        the text file at ``self.text_file_path_absolute``.

        Nothing is done if the path is ``None``,
        if the task has no configuration yet,
        or if the configuration has no language.
        """
        if self.text_file_path_absolute is None:
            return
        # the configuration is optional at construction time, hence
        # the path might be assigned before a configuration exists
        if (self.configuration is None) or (self.configuration.language is None):
            return
        parameters = {
            gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: self.configuration.is_text_unparsed_class_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: self.configuration.is_text_unparsed_id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: self.configuration.is_text_unparsed_id_sort,
        }
        self.text_file = TextFile(
            file_path=self.text_file_path_absolute,
            file_format=self.configuration.is_text_file_format,
            parameters=parameters,
            logger=None)
        self.text_file.set_language(self.configuration.language)

    def output_sync_map_file(self, container_root_path=None):
        """
        Output the sync map file for this task.

        If ``container_root_path`` is specified,
        the output sync map file will be created
        at the path obtained by joining
        the ``container_root_path`` and the relative path
        of the sync map inside the container.

        Otherwise, the sync map file will be created at the path
        ``sync_map_file_path_absolute``.

        Return the path of the sync map file created,
        or ``None`` if an error occurred
        (no sync map, no usable output path, or the write reported failure).

        :param container_root_path: the path to the root directory
                                    for the output container
        :type  container_root_path: string (path)
        :rtype: return the path of the sync map file created
        """
        if self.sync_map is None:
            return None

        if container_root_path is not None:
            # inside a container the relative path is mandatory
            if self.sync_map_file_path is None:
                return None
            path = os.path.join(container_root_path, self.sync_map_file_path)
        elif self.sync_map_file_path_absolute:
            path = self.sync_map_file_path_absolute
        else:
            # neither a container root nor an absolute path is available:
            # report the error as None instead of failing on an unbound ``path``
            return None

        sync_map_format = self.configuration.os_file_format
        parameters = {
            gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF: self.configuration.os_file_smil_page_ref,
            gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF: self.configuration.os_file_smil_audio_ref,
        }
        result = self.sync_map.write(sync_map_format, path, parameters)
        if not result:
            return None
        return path
예제 #47
0
    def _detect_start(self,
                      min_start_length,
                      max_start_length,
                      metric,
                      backwards=False):
        """
        Detect the time at which the text starts in the audio.

        A query wave is synthesized from the text, its MFCCs are
        matched via DTW against windows of the real audio MFCCs
        beginning at each admissible speech interval, and the start
        of the best candidate (as chosen by
        ``_select_best_candidate``) is returned.

        :param min_start_length: only speech intervals beginning
                                 at or after this time are considered
        :param max_start_length: only speech intervals beginning
                                 at or before this time are considered;
                                 it also bounds the synthesized query
                                 and the portion of audio examined
        :param metric: the metric (compared against ``SDMetric.VALUE``
                       and ``SDMetric.DISTORTION``) used to track improvements
                       and to select the best candidate
        :param backwards: if ``True``, synthesize backwards
                          and reverse the query wave
        :type  backwards: bool
        :rtype: the detected start time, or ``0.0`` if no candidate was found
        """

        self._log(["Min start length: %.3f", min_start_length])
        self._log(["Max start length: %.3f", max_start_length])
        self._log(["Metric:           %s", metric])
        self._log(["Backwards:        %s", str(backwards)])

        audio_rate = self.text_file.characters / self.audio_file.audio_length
        self._log(["Audio rate:     %.3f", audio_rate])

        self._log("Synthesizing query...")
        tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav",
                                                      dir=gf.custom_tmp_dir())
        try:
            synt = Synthesizer(logger=self.logger)
            synt_duration = max_start_length * self.QUERY_FACTOR
            self._log(["Synthesizing %.3f seconds", synt_duration])
            result = synt.synthesize(self.text_file,
                                     tmp_file_path,
                                     quit_after=synt_duration,
                                     backwards=backwards)
            self._log("Synthesizing query... done")

            query_file = AudioFile(tmp_file_path)
            if backwards:
                self._log("Reversing query")
                query_file.reverse()
            self._log("Extracting MFCCs for query...")
            query_file.extract_mfcc(frame_rate=self.frame_rate)
            query_file.clear_data()
            self._log("Extracting MFCCs for query... done")
        finally:
            # always remove the temporary file, even if the synthesis
            # or the MFCC extraction raised
            self._log("Cleaning up...")
            self._cleanup(tmp_handler, tmp_file_path)
            self._log("Cleaning up... done")

        query_characters = result[2]
        query_len = query_file.audio_length
        query_mfcc = query_file.audio_mfcc
        query_rate = query_characters / query_len

        stretch_factor = max(1, query_rate / audio_rate)
        self._log(["Audio rate:     %.3f", audio_rate])
        self._log(["Query rate:     %.3f", query_rate])
        self._log(["Stretch factor: %.3f", stretch_factor])

        audio_mfcc = self.audio_file.audio_mfcc
        self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
        audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR *
                                   self.frame_rate)
        self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
        audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
        audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
        self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])

        # o = frames in the (limited) real audio, n = frames in the query
        o = audio_mfcc.shape[1]
        n = query_mfcc.shape[1]

        # minimum length of a matched interval in the real audio
        stretched_match_minimum_length = int(n * stretch_factor)

        self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
        self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
        self._log(["Stretch factor:          %.3f", stretch_factor])
        self._log(
            ["Required minimum length: %.3f", stretched_match_minimum_length])
        self._log("Speech intervals:")
        for interval in self.audio_speech:
            self._log([
                "  %d %d == %.3f %.3f",
                self._t2i(interval[0]),
                self._t2i(interval[1]), interval[0], interval[1]
            ])

        admissible_intervals = [
            x for x in self.audio_speech
            if ((x[0] >= min_start_length) and (x[0] <= max_start_length))
        ]
        self._log("AdmissibleSpeech intervals:")
        for interval in admissible_intervals:
            self._log([
                "  %d %d == %.3f %.3f",
                self._t2i(interval[0]),
                self._t2i(interval[1]), interval[0], interval[1]
            ])

        candidates = []
        runs_with_min_length = 0
        runs_no_improvement = 0
        runs_min_distortion = numpy.inf
        runs_min_value = numpy.inf

        for interval in admissible_intervals:
            if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
                self._log("  Breaking: too many runs without improvement")
                break

            if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
                self._log(
                    "  Breaking: too many runs with minimum required length")
                break

            start_time = interval[0]
            start_index = self._t2i(start_time)
            self._log([
                "Evaluating interval starting at %d == %.3f ", start_index,
                start_time
            ])
            if start_index > o:
                self._log("  Breaking: start index outside audio window")
                break

            req_end_index = start_index + stretched_match_minimum_length
            req_end_time = self._i2t(req_end_index)
            if req_end_index > o:
                self._log(
                    "  Breaking: not enough audio left in shifted window")
                break
            end_index = min(start_index + 2 * n, o)
            end_time = self._i2t(end_index)

            self._log(["  Start   %d == %.3f", start_index, start_time])
            self._log(["  Req end %d == %.3f", req_end_index, req_end_time])
            self._log(["  Eff end %d == %.3f", end_index, end_time])

            audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
            m = audio_mfcc_sub.shape[1]

            self._log("Computing DTW...")
            aligner = DTWAligner(None,
                                 None,
                                 frame_rate=self.frame_rate,
                                 logger=self.logger)
            aligner.real_wave_full_mfcc = audio_mfcc_sub
            aligner.synt_wave_full_mfcc = query_mfcc
            aligner.real_wave_length = self._i2t(m)
            aligner.synt_wave_length = self._i2t(n)
            acm = aligner.compute_accumulated_cost_matrix()
            # transpose, so we have an n x m accumulated cost matrix
            acm = acm.transpose()
            last_row = acm[-1, :]
            self._log("Computing DTW... done")

            # numpy.argmin raises ValueError on an empty sequence:
            # if the window is not longer than the required minimum length
            # there is no admissible end index; the window cannot grow for
            # the later intervals, so stop here
            if stretched_match_minimum_length >= len(last_row):
                self._log(
                    "  Breaking: no admissible match end inside the DTW window")
                break

            # find the minimum, but its index must be >= stretched_match_minimum_length
            candidate_argmin_index = numpy.argmin(
                last_row[stretched_match_minimum_length:])
            candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
            candidate_length_time = self._i2t(candidate_length_index)
            candidate_value = last_row[candidate_length_index]
            candidate_end_index = start_index + candidate_length_index
            candidate_end_time = self._i2t(candidate_end_index)
            candidate_distortion = candidate_value / candidate_length_index

            # check if the candidate has minimum length
            if candidate_length_index == stretched_match_minimum_length:
                runs_with_min_length += 1
            else:
                runs_with_min_length = 0

            # check if the candidate improved the global minimum value
            if metric == SDMetric.VALUE:
                if candidate_value < runs_min_value:
                    runs_min_value = candidate_value
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1
            if metric == SDMetric.DISTORTION:
                if candidate_distortion < runs_min_distortion:
                    runs_min_distortion = candidate_distortion
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1

            # append to the list of candidates
            self._log([
                "    Interval  start:      %d == %.6f", start_index, start_time
            ])
            self._log(
                ["    Interval  end:        %d == %.6f", end_index, end_time])
            self._log([
                "    Candidate start:      %d == %.6f", start_index, start_time
            ])
            self._log([
                "    Candidate end:        %d == %.6f", candidate_end_index,
                candidate_end_time
            ])
            self._log([
                "    Candidate length:     %d == %.6f", candidate_length_index,
                candidate_length_time
            ])
            self._log(["    Candidate value:      %.6f", candidate_value])
            self._log(["    Candidate distortion: %.6f", candidate_distortion])
            candidates.append({
                "start_index": start_index,
                "length": candidate_length_index,
                "value": candidate_value,
                "distortion": candidate_distortion
            })

        # select best candidate and return its start time
        # if we have no best candidate, return 0.0
        best_candidate = self._select_best_candidate(candidates, metric)
        if best_candidate is None:
            return 0.0
        sd_time = self._i2t(max(best_candidate["start_index"], 0))
        self._log(["Returning time %.3f", sd_time])
        return sd_time
예제 #48
0
class AudioFileMFCC(Loggable):
    """
    A monaural (single channel) WAVE audio file,
    represented as a NumPy 2D matrix of
    Mel-frequency cepstral coefficients (MFCC).

    The matrix is "fat", that is,
    its number of rows is equal to the number of MFCC coefficients
    and its number of columns is equal to the number of window shifts
    in the audio file.
    The number of MFCC coefficients and the MFCC window shift can
    be modified via the
    :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_SIZE`
    and
    :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_WINDOW_SHIFT`
    keys in the ``rconf`` object.

    If ``mfcc_matrix`` is not ``None``,
    it will be used as the MFCC matrix.

    If ``file_path`` or ``audio_file`` is not ``None``,
    the MFCCs will be computed upon creation of the object,
    possibly converting to PCM16 Mono WAVE and/or
    loading audio data in memory.

    The MFCCs for the entire wave
    are divided into three
    contiguous intervals (possibly, zero-length)::

        HEAD   = [:middle_begin[
        MIDDLE = [middle_begin:middle_end[
        TAIL   = [middle_end:[

    The usual NumPy convention of including the left/start index
    and excluding the right/end index is adopted.

    For alignment purposes, only the ``MIDDLE`` portion of the wave
    is taken into account; the ``HEAD`` and ``TAIL`` intervals are ignored.

    This class heavily uses NumPy views and in-place operations
    to avoid creating temporary data or copying data around.

    :param string file_path: the path of the PCM16 mono WAVE file, or ``None``
    :param tuple file_format: the format of the audio file, if known in advance: ``(codec, channels, rate)`` or ``None``
    :param mfcc_matrix: the MFCC matrix to be set, or ``None``
    :type  mfcc_matrix: :class:`numpy.ndarray`
    :param audio_file: an audio file, or ``None``
    :type  audio_file: :class:`~aeneas.audiofile.AudioFile`
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    :raises: ValueError: if ``file_path``, ``audio_file``, and ``mfcc_matrix`` are all ``None``

    .. versionadded:: 1.5.0
    """

    TAG = u"AudioFileMFCC"

    def __init__(self,
                 file_path=None,
                 file_format=None,
                 mfcc_matrix=None,
                 audio_file=None,
                 rconf=None,
                 logger=None):
        # at least one source for the MFCCs must be provided
        if all(source is None for source in (file_path, audio_file, mfcc_matrix)):
            raise ValueError(
                u"You must initialize with at least one of: file_path, audio_file, or mfcc_matrix"
            )
        super(AudioFileMFCC, self).__init__(rconf=rconf, logger=logger)
        self.file_path = file_path
        self.audio_file = audio_file
        self.is_reversed = False
        self.__mfcc = None
        self.__mfcc_mask = None
        self.__mfcc_mask_map = None
        self.__speech_intervals = None
        self.__nonspeech_intervals = None
        self.log(u"Initializing MFCCs...")
        if mfcc_matrix is not None:
            # a ready-made matrix wins over the other sources;
            # the audio length is derived from the number of frames
            self.__mfcc = mfcc_matrix
            self.audio_length = self.all_length * self.rconf.mws
        elif not ((self.file_path is None) and (self.audio_file is None)):
            # remember whether the AudioFile object is created here,
            # so that it can be released once the MFCCs are computed
            created_here = self.audio_file is None
            if created_here:
                self.audio_file = AudioFile(file_path=self.file_path,
                                            file_format=file_format,
                                            rconf=self.rconf,
                                            logger=self.logger)
                # NOTE load audio samples into memory, if not present already
                self.audio_file.audio_samples
            gf.run_c_extension_with_fallback(self.log,
                                             "cmfcc",
                                             self._compute_mfcc_c_extension,
                                             self._compute_mfcc_pure_python,
                                             (),
                                             rconf=self.rconf)
            self.audio_length = self.audio_file.audio_length
            if created_here:
                self.log(u"Clearing the audio data...")
                self.audio_file.clear_data()
                self.audio_file = None
                self.log(u"Clearing the audio data... done")
        # initially MIDDLE spans the whole wave: HEAD and TAIL are empty
        self.__middle_begin = 0
        self.__middle_end = self.__mfcc.shape[1]
        self.log(u"Initializing MFCCs... done")

    def __unicode__(self):
        """
        Return a two-line Unicode summary:
        the file path and the audio length in seconds.
        """
        path_line = u"File path:        %s" % self.file_path
        length_line = u"Audio length (s): %s" % gf.safe_float(self.audio_length)
        return u"\n".join([path_line, length_line])

    def __str__(self):
        """
        Return the same summary as ``__unicode__()``,
        passed through ``gf.safe_str()``.
        """
        summary = self.__unicode__()
        return gf.safe_str(summary)

    @property
    def all_mfcc(self):
        """
        The MFCCs of the entire audio file,
        that is, HEAD + MIDDLE + TAIL.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        return self.__mfcc

    @property
    def all_length(self):
        """
        The length, in MFCC coefficients,
        of the entire audio file,
        that is, HEAD + MIDDLE + TAIL.

        :rtype: int
        """
        return self.__mfcc.shape[1]

    @property
    def middle_mfcc(self):
        """
        The MFCCs of the middle part of the audio file,
        that is, without HEAD and TAIL.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        return self.__mfcc[:, self.__middle_begin:self.__middle_end]

    @property
    def middle_length(self):
        """
        The length, in MFCC coefficients,
        of the middle part of the audio file,
        that is, without HEAD and TAIL.

        :rtype: int
        """
        return self.__middle_end - self.__middle_begin

    @property
    def middle_map(self):
        """
        Return the map
        from the MFCC frame indices
        in the MIDDLE portion of the wave
        to the MFCC FULL frame indices,
        that is, an ``numpy.arange(self.middle_begin, self.middle_end)``.

        NOTE: to translate indices of MIDDLE,
        instead of using fancy indexing with the
        result of this function, you might want to simply
        add ``self.head_length``.
        This function is provided mostly for consistency
        with the MASKED case.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        return numpy.arange(self.__middle_begin, self.__middle_end)

    @property
    def head_length(self):
        """
        The length, in MFCC coefficients,
        of the HEAD of the audio file.

        :rtype: int
        """
        return self.__middle_begin

    @property
    def tail_length(self):
        """
        The length, in MFCC coefficients,
        of the TAIL of the audio file.

        :rtype: int
        """
        return self.all_length - self.__middle_end

    @property
    def tail_begin(self):
        """
        The index, in MFCC coefficients,
        where the TAIL of the audio file starts.

        :rtype: int
        """
        return self.__middle_end

    @property
    def audio_length(self):
        """
        The length, in seconds, of the audio file.

        This value is the actual length of the audio file,
        computed as ``number of samples / sample_rate``,
        hence it might differ than ``len(self.__mfcc) * mfcc_window_shift``.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        return self.__audio_length

    @audio_length.setter
    def audio_length(self, audio_length):
        self.__audio_length = audio_length

    @property
    def is_reversed(self):
        """
        Return ``True`` if currently reversed.

        :rtype: bool
        """
        return self.__is_reversed

    @is_reversed.setter
    def is_reversed(self, is_reversed):
        self.__is_reversed = is_reversed

    @property
    def masked_mfcc(self):
        """
        Return the MFCC speech frames
        in the FULL wave.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        self._ensure_mfcc_mask()
        return self.__mfcc[:, self.__mfcc_mask]

    @property
    def masked_length(self):
        """
        Return the number of MFCC speech frames
        in the FULL wave.

        :rtype: int
        """
        self._ensure_mfcc_mask()
        return len(self.__mfcc_mask_map)

    @property
    def masked_map(self):
        """
        Return the map
        from the MFCC speech frame indices
        to the MFCC FULL frame indices.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        self._ensure_mfcc_mask()
        return self.__mfcc_mask_map

    @property
    def masked_middle_mfcc(self):
        """
        Return the MFCC speech frames
        in the MIDDLE portion of the wave.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        begin, end = self._masked_middle_begin_end()
        return (self.masked_mfcc)[:, begin:end]

    @property
    def masked_middle_length(self):
        """
        Return the number of MFCC speech frames
        in the MIDDLE portion of the wave.

        :rtype: int
        """
        begin, end = self._masked_middle_begin_end()
        return end - begin

    @property
    def masked_middle_map(self):
        """
        Return the map
        from the MFCC speech frame indices
        in the MIDDLE portion of the wave
        to the MFCC FULL frame indices.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        begin, end = self._masked_middle_begin_end()
        return self.__mfcc_mask_map[begin:end]

    def _masked_middle_begin_end(self):
        """
        Return the begin and end indices w.r.t. ``self.__mfcc_mask_map``,
        corresponding to indices in the MIDDLE portion of the wave,
        that is, which fall between ``self.__middle_begin`` and
        ``self.__middle_end`` in ``self.__mfcc``.

        :rtype: (int, int)
        """
        self._ensure_mfcc_mask()
        begin = numpy.searchsorted(self.__mfcc_mask_map,
                                   self.__middle_begin,
                                   side="left")
        end = numpy.searchsorted(self.__mfcc_mask_map,
                                 self.__middle_end,
                                 side="right")
        return (begin, end)

    def intervals(self, speech=True, time=True):
        """
        Return a list of intervals::

        [(b_1, e_1), (b_2, e_2), ..., (b_k, e_k)]

        where ``b_i`` is the time when the ``i``-th interval begins,
        and ``e_i`` is the time when it ends.

        :param bool speech: if ``True``, return speech intervals,
                            otherwise return nonspeech intervals
        :param bool time: if ``True``, return values in seconds (:class:`~aeneas.timevalue.TimeValue`),
                          otherwise in indices (int)
        :rtype: list of pairs (see above)
        """
        self._ensure_mfcc_mask()
        if speech:
            self.log(u"Converting speech runs to intervals")
            intervals = self.__speech_intervals
        else:
            self.log(u"Converting nonspeech runs to intervals")
            intervals = self.__nonspeech_intervals
        if time:
            mws = self.rconf.mws
            return [(i[0] * mws, (i[1] + 1) * mws) for i in intervals]
        return intervals

    def inside_nonspeech(self, index):
        """
        If ``index`` is contained in a nonspeech interval,
        return a pair ``(interval_begin, interval_end)``
        such that ``interval_begin <= index < interval_end``,
        i.e., ``interval_end`` is assumed not to be included.

        Otherwise, return ``None``.

        :rtype: ``None`` or tuple
        """
        self._ensure_mfcc_mask()
        if (index < 0) or (index >=
                           self.all_length) or (self.__mfcc_mask[index]):
            return None
        return self._binary_search_intervals(self.__nonspeech_intervals, index)

    @classmethod
    def _binary_search_intervals(cls, intervals, index):
        """
        Binary search for the interval containing index,
        assuming there is such an interval.
        This function should never return ``None``.
        """
        start = 0
        end = len(intervals) - 1
        while start <= end:
            middle_index = start + ((end - start) // 2)
            middle = intervals[middle_index]
            if (middle[0] <= index) and (index < middle[1]):
                return middle
            elif middle[0] > index:
                end = middle_index - 1
            else:
                start = middle_index + 1
        return None

    @property
    def middle_begin(self):
        """
        Return the index where MIDDLE starts.

        :rtype: int
        """
        return self.__middle_begin

    @middle_begin.setter
    def middle_begin(self, index):
        """
        Set the index where MIDDLE starts.

        :param int index: the new index for MIDDLE begin
        """
        if (index < 0) or (index > self.all_length):
            raise ValueError(u"The given index is not valid")
        self.__middle_begin = index

    @property
    def middle_begin_seconds(self):
        """
        Return the time instant, in seconds, where MIDDLE starts,
        computed as the MIDDLE begin index times ``self.rconf.mws``.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        begin_frames = TimeValue(self.__middle_begin)
        return begin_frames * self.rconf.mws

    @property
    def middle_end(self):
        """
        Return the index (+1) where MIDDLE ends.

        :rtype: int
        """
        return self.__middle_end

    @middle_end.setter
    def middle_end(self, index):
        """
        Set the index (+1) where MIDDLE ends.

        :param int index: the new index for MIDDLE end
        """
        if (index < 0) or (index > self.all_length):
            raise ValueError(u"The given index is not valid")
        self.__middle_end = index

    @property
    def middle_end_seconds(self):
        """
        Return the time instant, in seconds, where MIDDLE ends,
        computed as the MIDDLE end index times ``self.rconf.mws``.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        end_frames = TimeValue(self.__middle_end)
        return end_frames * self.rconf.mws

    def _ensure_mfcc_mask(self):
        """
        Ensure that ``run_vad()`` has already been called,
        and hence ``self.__mfcc_mask`` has a meaningful value.
        """
        if self.__mfcc_mask is None:
            self.log(u"VAD was not run: running it now")
            self.run_vad()

    def _compute_mfcc_c_extension(self):
        """
        Compute MFCCs using the Python C extension cmfcc.
        """
        self.log(u"Computing MFCCs using C extension...")
        try:
            self.log(u"Importing cmfcc...")
            import aeneas.cmfcc.cmfcc
            self.log(u"Importing cmfcc... done")
            self.__mfcc = (aeneas.cmfcc.cmfcc.compute_from_data(
                self.audio_file.audio_samples,
                self.audio_file.audio_sample_rate,
                self.rconf[RuntimeConfiguration.MFCC_FILTERS],
                self.rconf[RuntimeConfiguration.MFCC_SIZE],
                self.rconf[RuntimeConfiguration.MFCC_FFT_ORDER],
                self.rconf[RuntimeConfiguration.MFCC_LOWER_FREQUENCY],
                self.rconf[RuntimeConfiguration.MFCC_UPPER_FREQUENCY],
                self.rconf[RuntimeConfiguration.MFCC_EMPHASIS_FACTOR],
                self.rconf[RuntimeConfiguration.MFCC_WINDOW_LENGTH],
                self.rconf[RuntimeConfiguration.MFCC_WINDOW_SHIFT])[0]
                           ).transpose()
            self.log(u"Computing MFCCs using C extension... done")
            return (True, None)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while running cmfcc",
                         exc, False, None)
        return (False, None)

    def _compute_mfcc_pure_python(self):
        """
        Compute MFCCs using the pure Python code,
        and store the transposed result in ``self.__mfcc``.

        Any exception raised during the computation
        is logged and reported as a failure.

        :rtype: ``(True, None)`` on success, ``(False, None)`` on failure
        """
        self.log(u"Computing MFCCs using pure Python code...")
        try:
            extractor = MFCC(rconf=self.rconf, logger=self.logger)
            coefficients = extractor.compute_from_data(
                self.audio_file.audio_samples,
                self.audio_file.audio_sample_rate)
            self.__mfcc = coefficients.transpose()
            self.log(u"Computing MFCCs using pure Python code... done")
            return (True, None)
        except Exception as exc:
            self.log_exc(
                u"An unexpected error occurred while running pure Python code",
                exc, False, None)
        return (False, None)

    def reverse(self):
        """
        Reverse the audio file.

        The MFCC matrix and the VAD mask are replaced by reversed
        NumPy views; the mask map is mirrored in place and then
        viewed in reverse, instead of swapping values.

        Only speech and nonspeech intervals are actually recomputed
        as Python lists.

        The middle begin/end indices are mirrored around ``all_length``
        and ``is_reversed`` is toggled.
        """
        self.log(u"Reversing...")
        all_length = self.all_length
        self.__mfcc = self.__mfcc[:, ::-1]
        # both right-hand values are read before either attribute is written
        self.__middle_begin, self.__middle_end = (
            all_length - self.__middle_end,
            all_length - self.__middle_begin,
        )
        if self.__mfcc_mask is not None:
            # frame index i of the original lands at (all_length - 1) - i
            last_index = all_length - 1
            self.__mfcc_mask = self.__mfcc_mask[::-1]
            # equivalent to
            # self.__mfcc_mask_map = (last_index - self.__mfcc_mask_map)[::-1]
            # but done in place using NumPy view
            self.__mfcc_mask_map *= -1
            self.__mfcc_mask_map += last_index
            self.__mfcc_mask_map = self.__mfcc_mask_map[::-1]
            # run_vad() stores intervals as inclusive (first, last) frame
            # indices, so they must be mirrored with the same
            # (all_length - 1) - i mapping used for the mask map above;
            # mirroring with all_length - i shifts every interval by one
            # and can yield the out-of-range index all_length
            self.__speech_intervals = [
                (last_index - end, last_index - begin)
                for (begin, end) in self.__speech_intervals[::-1]
            ]
            self.__nonspeech_intervals = [
                (last_index - end, last_index - begin)
                for (begin, end) in self.__nonspeech_intervals[::-1]
            ]
        self.is_reversed = not self.is_reversed
        self.log(u"Reversing...done")

    def run_vad(self):
        """
        Run the VAD on the first row of the MFCC matrix,
        storing internally the resulting boolean mask,
        the indices where the mask is true,
        and the speech and nonspeech intervals derived from it.
        """
        def _runs_to_intervals(indices):
            """
            Split ``indices`` wherever two adjacent values
            do not differ by exactly one, and return
            the ``(first, last)`` pair of each resulting run.

            :param indices: the data array
            :type  indices: :class:`numpy.ndarray` (1D)
            :rtype: list of tuples
            """
            if len(indices) < 1:
                return []
            # positions where a new run starts
            cut_points = numpy.where(numpy.diff(indices) != 1)[0] + 1
            return [
                (run[0], run[-1])
                for run in numpy.split(indices, cut_points)
            ]

        self.log(u"Creating VAD object")
        detector = VAD(rconf=self.rconf, logger=self.logger)
        self.log(u"Running VAD...")
        mask = detector.run_vad(self.__mfcc[0])
        self.__mfcc_mask = mask
        self.__mfcc_mask_map = (numpy.where(mask))[0]
        self.log(u"Running VAD... done")
        self.log(u"Storing speech and nonspeech intervals...")
        # the indices of the speech frames are already available
        # in the mask map, so they are reused here
        self.__speech_intervals = _runs_to_intervals(self.__mfcc_mask_map)
        # the indices of the nonspeech frames must be computed now
        nonspeech_indices = (numpy.where(~mask))[0]
        self.__nonspeech_intervals = _runs_to_intervals(nonspeech_indices)
        self.log(u"Storing speech and nonspeech intervals... done")

    def set_head_middle_tail(self,
                             head_length=None,
                             middle_length=None,
                             tail_length=None):
        """
        Explicitly set the HEAD, MIDDLE, and TAIL of the audio file.

        Arguments left to ``None`` are ignored.
        When ``middle_length`` is given, ``tail_length`` is not applied,
        even if it is specified.

        :param head_length: the length of HEAD, in seconds
        :type  head_length: :class:`~aeneas.timevalue.TimeValue`
        :param middle_length: the length of MIDDLE, in seconds
        :type  middle_length: :class:`~aeneas.timevalue.TimeValue`
        :param tail_length: the length of TAIL, in seconds
        :type  tail_length: :class:`~aeneas.timevalue.TimeValue`
        :raises: TypeError: if one of the arguments is not ``None``
                            or :class:`~aeneas.timevalue.TimeValue`
        :raises: ValueError: if one of the arguments is greater
                             than the length of the audio file
        """
        named_lengths = (
            ("head_length", head_length),
            ("middle_length", middle_length),
            ("tail_length", tail_length),
        )
        for name, value in named_lengths:
            if value is None:
                # unspecified lengths need no validation
                continue
            if not isinstance(value, TimeValue):
                raise TypeError(u"%s is not None or TimeValue" % name)
            if value > self.audio_length:
                raise ValueError(
                    u"%s is greater than the length of the audio file" % name)

        self.log(u"Setting head middle tail...")
        mws = self.rconf.mws

        def _to_frames(length):
            # number of whole window shifts contained in the given length
            return int(length / mws)

        self.log([
            u"Before: 0 %d %d %d",
            self.middle_begin,
            self.middle_end,
            self.all_length,
        ])
        if head_length is not None:
            self.middle_begin = _to_frames(head_length)
        if middle_length is not None:
            # measured from the (possibly just updated) middle begin
            self.middle_end = self.middle_begin + _to_frames(middle_length)
        elif tail_length is not None:
            self.middle_end = self.all_length - _to_frames(tail_length)
        self.log([
            u"After:  0 %d %d %d",
            self.middle_begin,
            self.middle_end,
            self.all_length,
        ])
        self.log(u"Setting head middle tail... done")
예제 #49
0
    def _cut_head_tail(self, audio_file_path):
        """
        Set the audio file head or tail,
        suitably cutting the audio file on disk,
        and setting the corresponding parameters in the task configuration.

        Return a success bool flag
        """
        self._log("Setting head and/or tail")
        try:
            configuration = self.task.configuration
            head_length = configuration.is_audio_file_head_length
            process_length = configuration.is_audio_file_process_length
            detect_head_min = configuration.is_audio_file_detect_head_min
            detect_head_max = configuration.is_audio_file_detect_head_max
            detect_tail_min = configuration.is_audio_file_detect_tail_min
            detect_tail_max = configuration.is_audio_file_detect_tail_max

            # explicit head or process?
            explicit = (head_length is not None) or (process_length is not None)

            # at least one detect parameter?
            detect = (
                (detect_head_min is not None) or
                (detect_head_max is not None) or
                (detect_tail_min is not None) or
                (detect_tail_max is not None)
            )

            if explicit or detect:
                # we need to load the audio data
                audio_file = AudioFile(audio_file_path, logger=self.logger)
                audio_file.load_data()

                if explicit:
                    self._log("Explicit head or process")
                else:
                    self._log("No explicit head or process => detecting head/tail")

                    head = 0.0
                    if (detect_head_min is not None) or (detect_head_max is not None):
                        self._log("Detecting head...")
                        detect_head_min = gf.safe_float(detect_head_min, gc.SD_MIN_HEAD_LENGTH)
                        detect_head_max = gf.safe_float(detect_head_max, gc.SD_MAX_HEAD_LENGTH)
                        self._log(["detect_head_min is %.3f", detect_head_min])
                        self._log(["detect_head_max is %.3f", detect_head_max])
                        sd = SD(audio_file, self.task.text_file, logger=self.logger)
                        head = sd.detect_head(detect_head_min, detect_head_max)
                        self._log(["Detected head: %.3f", head])

                    tail = 0.0
                    if (detect_tail_min is not None) or (detect_tail_max is not None):
                        self._log("Detecting tail...")
                        detect_tail_max = gf.safe_float(detect_tail_max, gc.SD_MAX_TAIL_LENGTH)
                        detect_tail_min = gf.safe_float(detect_tail_min, gc.SD_MIN_TAIL_LENGTH)
                        self._log(["detect_tail_min is %.3f", detect_tail_min])
                        self._log(["detect_tail_max is %.3f", detect_tail_max])
                        sd = SD(audio_file, self.task.text_file, logger=self.logger)
                        tail = sd.detect_tail(detect_tail_min, detect_tail_max)
                        self._log(["Detected tail: %.3f", tail])

                    # sanity check
                    head_length = max(0, head)
                    process_length = max(0, audio_file.audio_length - tail - head)

                    # we need to set these values
                    # in the config object for later use
                    self.task.configuration.is_audio_file_head_length = head_length
                    self.task.configuration.is_audio_file_process_length = process_length
                    self._log(["Set head_length:    %.3f", head_length])
                    self._log(["Set process_length: %.3f", process_length])

                if head_length is not None:
                    # in case we are reading from config object
                    head_length = float(head_length)
                if process_length is not None:
                    # in case we are reading from config object
                    process_length = float(process_length)
                # note that str() is necessary, as one might be None
                self._log(["is_audio_file_head_length is %s", str(head_length)])
                self._log(["is_audio_file_process_length is %s", str(process_length)])
                self._log("Trimming audio data...")
                audio_file.trim(head_length, process_length)
                self._log("Trimming audio data... done")
                self._log("Writing audio file...")
                audio_file.write(audio_file_path)
                self._log("Writing audio file... done")
                audio_file.clear_data()
            else:
                # nothing to do
                self._log("No explicit head/process or detect head/tail")

            self._log("Setting head and/or tail: succeeded")
            return True
        except Exception as e:
            self._log("Setting head and/or tail: failed")
            self._log(["Message: %s", str(e)])
            return False
예제 #50
0
파일: task.py 프로젝트: shivupoojar/DeFog
class Task(Loggable):
    """
    A structure representing a task, that is,
    an audio file and an ordered set of text fragments
    to be synchronized.

    :param string config_string: the task configuration string
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    :raises: TypeError: if ``config_string`` is not ``None`` and
                        it is not a Unicode string
    """

    TAG = u"Task"

    def __init__(self, config_string=None, rconf=None, logger=None):
        super(Task, self).__init__(rconf=rconf, logger=logger)
        self.identifier = gf.uuid_string()
        # NOTE: assigning the *_path_absolute attributes below goes through
        # their property setters, which call the _populate_*() methods
        self.configuration = None
        self.audio_file_path = None  # relative to input container root
        self.audio_file_path_absolute = None  # concrete path, file will be read from this!
        self.audio_file = None
        self.text_file_path = None  # relative to input container root
        self.text_file_path_absolute = None  # concrete path, file will be read from this!
        self.text_file = None
        self.sync_map_file_path = None  # relative to output container root
        self.sync_map_file_path_absolute = None  # concrete path, file will be written to this!
        self.sync_map = None
        if config_string is not None:
            self.configuration = TaskConfiguration(config_string)

    def __unicode__(self):
        # the configuration is None until a config string is provided,
        # so it cannot be asked for its Unicode representation blindly
        if self.configuration is None:
            configuration = None
        else:
            configuration = self.configuration.__unicode__()
        msg = [
            u"%s: '%s'" % (gc.RPN_TASK_IDENTIFIER, self.identifier),
            u"Configuration:\n%s" % configuration,
            u"Audio file path: %s" % self.audio_file_path,
            u"Audio file path (absolute): %s" % self.audio_file_path_absolute,
            u"Text file path: %s" % self.text_file_path,
            u"Text file path (absolute): %s" % self.text_file_path_absolute,
            u"Sync map file path: %s" % self.sync_map_file_path,
            u"Sync map file path (absolute): %s" %
            self.sync_map_file_path_absolute
        ]
        return u"\n".join(msg)

    def __str__(self):
        return gf.safe_str(self.__unicode__())

    @property
    def identifier(self):
        """
        The identifier of the task.

        :rtype: string
        """
        return self.__identifier

    @identifier.setter
    def identifier(self, value):
        self.__identifier = value

    @property
    def audio_file_path_absolute(self):
        """
        The absolute path of the audio file.

        Setting it also (re)populates ``self.audio_file``.

        :rtype: string
        """
        return self.__audio_file_path_absolute

    @audio_file_path_absolute.setter
    def audio_file_path_absolute(self, audio_file_path_absolute):
        self.__audio_file_path_absolute = audio_file_path_absolute
        self._populate_audio_file()

    @property
    def text_file_path_absolute(self):
        """
        The absolute path of the text file.

        Setting it also (re)populates ``self.text_file``.

        :rtype: string
        """
        return self.__text_file_path_absolute

    @text_file_path_absolute.setter
    def text_file_path_absolute(self, text_file_path_absolute):
        self.__text_file_path_absolute = text_file_path_absolute
        self._populate_text_file()

    @property
    def sync_map_file_path_absolute(self):
        """
        The absolute path of the sync map file.

        :rtype: string
        """
        return self.__sync_map_file_path_absolute

    @sync_map_file_path_absolute.setter
    def sync_map_file_path_absolute(self, sync_map_file_path_absolute):
        self.__sync_map_file_path_absolute = sync_map_file_path_absolute

    def sync_map_leaves(self, fragment_type=None):
        """
        Return the list of non-empty leaves
        in the sync map associated with the task.

        If ``fragment_type`` has been specified,
        return only leaves of that fragment type.

        If the sync map or its fragments tree is not set,
        return an empty list.

        :param int fragment_type: type of fragment to return
        :rtype: list

        .. versionadded:: 1.7.0
        """
        if (self.sync_map is None) or (self.sync_map.fragments_tree is None):
            return []
        return list(self.sync_map.leaves(fragment_type))

    def output_sync_map_file(self, container_root_path=None):
        """
        Output the sync map file for this task.

        If ``container_root_path`` is specified,
        the output sync map file will be created
        at the path obtained by joining
        the ``container_root_path`` and the relative path
        of the sync map inside the container.

        Otherwise, the sync map file will be created at the path
        ``self.sync_map_file_path_absolute``.

        Return the path of the sync map file created,
        or ``None`` if an error occurred.

        :param string container_root_path: the path to the root directory
                                           for the output container
        :rtype: string
        """
        if self.sync_map is None:
            self.log_exc(u"The sync_map object has not been set", None, True,
                         TypeError)

        if (container_root_path
                is not None) and (self.sync_map_file_path is None):
            self.log_exc(
                u"The (internal) path of the sync map has not been set",
                None, True, TypeError)

        self.log([u"container_root_path is %s", container_root_path])
        self.log([u"self.sync_map_file_path is %s", self.sync_map_file_path])
        self.log([
            u"self.sync_map_file_path_absolute is %s",
            self.sync_map_file_path_absolute
        ])

        if (container_root_path is not None) and (self.sync_map_file_path
                                                  is not None):
            path = os.path.join(container_root_path, self.sync_map_file_path)
        elif self.sync_map_file_path_absolute:
            path = self.sync_map_file_path_absolute
        else:
            # neither a container-relative nor an absolute path is available:
            # report it like the other fatal conditions above, instead of
            # failing later on an unbound ``path`` variable.
            # NOTE(review): log_exc is presumably raising here, as in the
            # calls above; the explicit return covers the case it does not.
            self.log_exc(
                u"The absolute path of the sync map has not been set",
                None, True, TypeError)
            return None
        gf.ensure_parent_directory(path)
        self.log([u"Output sync map to %s", path])

        eaf_audio_ref = self.configuration["o_eaf_audio_ref"]
        head_tail_format = self.configuration["o_h_t_format"]
        levels = self.configuration["o_levels"]
        smil_audio_ref = self.configuration["o_smil_audio_ref"]
        smil_page_ref = self.configuration["o_smil_page_ref"]
        sync_map_format = self.configuration["o_format"]

        self.log([u"eaf_audio_ref is %s", eaf_audio_ref])
        self.log([u"head_tail_format is %s", head_tail_format])
        self.log([u"levels is %s", levels])
        self.log([u"smil_audio_ref is %s", smil_audio_ref])
        self.log([u"smil_page_ref is %s", smil_page_ref])
        self.log([u"sync_map_format is %s", sync_map_format])

        self.log(u"Calling sync_map.write...")
        parameters = {
            gc.PPN_TASK_OS_FILE_EAF_AUDIO_REF: eaf_audio_ref,
            gc.PPN_TASK_OS_FILE_HEAD_TAIL_FORMAT: head_tail_format,
            gc.PPN_TASK_OS_FILE_LEVELS: levels,
            gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF: smil_audio_ref,
            gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF: smil_page_ref,
        }
        self.sync_map.write(sync_map_format, path, parameters)
        self.log(u"Calling sync_map.write... done")
        return path

    def _populate_audio_file(self):
        """
        Create the ``self.audio_file`` object by reading
        the properties of the audio file
        at ``self.audio_file_path_absolute``.

        If the path is ``None``, nothing is created.
        """
        self.log(u"Populate audio file...")
        if self.audio_file_path_absolute is not None:
            self.log([
                u"audio_file_path_absolute is '%s'",
                self.audio_file_path_absolute
            ])
            self.audio_file = AudioFile(
                file_path=self.audio_file_path_absolute, logger=self.logger)
            self.audio_file.read_properties()
        else:
            self.log(u"audio_file_path_absolute is None")
        self.log(u"Populate audio file... done")

    def _populate_text_file(self):
        """
        Create the ``self.text_file`` object by reading
        the text file at ``self.text_file_path_absolute``.

        Nothing is created if the path, the task configuration,
        or the configured language is ``None``.
        """
        self.log(u"Populate text file...")
        # the configuration itself is None when the task has been created
        # without a config string, so check it before subscripting it
        if ((self.text_file_path_absolute is not None)
                and (self.configuration is not None)
                and (self.configuration["language"] is not None)):
            # the following values might be None
            parameters = {
                gc.PPN_TASK_IS_TEXT_FILE_IGNORE_REGEX:
                self.configuration["i_t_ignore_regex"],
                gc.PPN_TASK_IS_TEXT_FILE_TRANSLITERATE_MAP:
                self.configuration["i_t_transliterate_map"],
                gc.PPN_TASK_IS_TEXT_MPLAIN_WORD_SEPARATOR:
                self.configuration["i_t_mplain_word_separator"],
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX:
                self.configuration["i_t_munparsed_l1_id_regex"],
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX:
                self.configuration["i_t_munparsed_l2_id_regex"],
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX:
                self.configuration["i_t_munparsed_l3_id_regex"],
                gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX:
                self.configuration["i_t_unparsed_class_regex"],
                gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX:
                self.configuration["i_t_unparsed_id_regex"],
                gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT:
                self.configuration["i_t_unparsed_id_sort"],
                gc.PPN_TASK_OS_FILE_ID_REGEX:
                self.configuration["o_id_regex"]
            }
            self.text_file = TextFile(
                file_path=self.text_file_path_absolute,
                file_format=self.configuration["i_t_format"],
                parameters=parameters,
                logger=self.logger)
            self.text_file.set_language(self.configuration["language"])
        else:
            self.log(u"text_file_path_absolute and/or language is None")
        self.log(u"Populate text file... done")
예제 #51
0
 def load(self, path):
     """
     Return an ``AudioFile`` built from ``path``
     after passing it through ``get_abs_path``.
     """
     absolute_path = get_abs_path(path)
     return AudioFile(absolute_path)
예제 #52
0
    def _synthesize_multiple_generic(self,
                                     helper_function,
                                     text_file,
                                     output_file_path,
                                     quit_after=None,
                                     backwards=False):
        """
        Synthesize multiple fragments, generic function.

        The ``helper_function`` is a function that takes parameters
        ``(text, voice_code, output_file_path)``
        and returns a tuple
        ``(result, (audio_length, audio_sample_rate, audio_format, audio_samples))``.

        :rtype: tuple (result, (anchors, current_time, num_chars))
        """
        self.log(u"Calling TTS engine using multiple generic function...")

        # get sample rate and codec
        self.log(u"Determining codec and sample rate...")
        if (self.OUTPUT_AUDIO_FORMAT is None) or (len(self.OUTPUT_AUDIO_FORMAT)
                                                  != 3):
            self.log(u"Determining codec and sample rate with dummy text...")
            succeeded, data = helper_function(
                text=u"Dummy text to get sample_rate",
                voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE),
                output_file_path=None)
            if not succeeded:
                self.log_crit(
                    u"An unexpected error occurred in helper_function")
                return (False, None)
            du_nu, sample_rate, codec, da_nu = data
            self.log(
                u"Determining codec and sample rate with dummy text... done")
        else:
            self.log(u"Reading codec and sample rate from OUTPUT_AUDIO_FORMAT")
            codec, channels_nu, sample_rate = self.OUTPUT_AUDIO_FORMAT
        self.log(u"Determining codec and sample rate... done")
        self.log([u"  codec:       %s", codec])
        self.log([u"  sample rate: %d", sample_rate])

        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = codec
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate

        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        loop_function = self._loop_use_cache if self.use_cache else self._loop_no_cache
        for num, fragment in enumerate(fragments):
            succeeded, data = loop_function(helper_function=helper_function,
                                            num=num,
                                            fragment=fragment)
            if not succeeded:
                self.log_crit(u"An unexpected error occurred in loop_function")
                return (False, None)
            duration, sr_nu, enc_nu, samples = data
            # store for later output
            anchors.append([current_time, fragment.identifier, fragment.text])
            # increase the character counter
            num_chars += fragment.characters
            # concatenate new samples
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                output_file.add_samples(samples, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log(
                    [u"Quitting after reached duration %.3f", current_time])
                break

        # minimize memory
        self.log(u"Minimizing memory...")
        output_file.minimize_memory()
        self.log(u"Minimizing memory... done")

        # if backwards, we need to reverse the audio samples again
        if backwards:
            self.log(u"Reversing audio samples...")
            output_file.reverse()
            self.log(u"Reversing audio samples... done")

        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)

        # return output
        if backwards:
            self.log_warn(
                u"Please note that anchor time values do not make sense since backwards=True"
            )
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine using multiple generic function... done")
        return (True, (anchors, current_time, num_chars))
예제 #53
0
    def perform_command(self):
        """
        Plot the waveform of the input audio file to an image file,
        optionally with a time scale and with the label sets
        read from the files given through ``-i`` arguments,
        and return the appropriate exit code.

        :rtype: int
        """
        arguments = self.actual_arguments
        if len(arguments) < 2:
            return self.print_help()
        input_file_path = arguments[0]
        output_file_path = arguments[1]

        # the output file is checked only if the input file is fine
        if not (self.check_input_file(input_file_path)
                and self.check_output_file(output_file_path)):
            return self.ERROR_EXIT_CODE

        fast = self.has_option("--fast")
        fragment_text = self.has_option("--text")
        h_zoom = gf.safe_int(self.has_option_with_value("--hzoom"), 5)
        label = self.has_option_with_value("--label")
        time_step = gf.safe_int(self.has_option_with_value("--time-step"), 0)
        v_zoom = gf.safe_int(self.has_option_with_value("--vzoom"), 30)

        labels = not self.has_option("--no-labels")
        begin_times = not self.has_option("--no-begin-times")
        end_times = not self.has_option("--no-end-times")
        begin_guides = not self.has_option("--no-begin-guides")
        end_guides = not self.has_option("--no-end-guides")

        try:
            # import or ImportError
            from aeneas.plotter import PlotLabelset
            from aeneas.plotter import PlotTimeScale
            from aeneas.plotter import PlotWaveform
            from aeneas.plotter import Plotter

            # create plotter object
            self.print_info(u"Plotting to file...")
            plotter = Plotter(rconf=self.rconf, logger=self.logger)

            def _add_labelset(file_path, file_format, with_text, show_labels):
                # read a sync map file and add it to the plot as a label set
                labelset = self._read_syncmap_file(
                    file_path, file_format, with_text)
                plot_labelset = PlotLabelset(labelset,
                                             parameters=None,
                                             rconf=self.rconf,
                                             logger=self.logger)
                for key, value in (("labels", show_labels),
                                   ("begin_time", begin_times),
                                   ("end_time", end_times),
                                   ("begin_guide", begin_guides),
                                   ("end_guide", end_guides)):
                    plot_labelset.parameters[key] = value
                plotter.add_labelset(plot_labelset)

            # add waveform
            afm = AudioFile(input_file_path,
                            rconf=self.rconf,
                            logger=self.logger)
            afm.read_samples_from_file()
            waveform = PlotWaveform(afm,
                                    label=label,
                                    fast=fast,
                                    rconf=self.rconf,
                                    logger=self.logger)
            plotter.add_waveform(waveform)

            # add time scale, if requested
            if time_step > 0:
                timescale = PlotTimeScale(afm.audio_length,
                                          time_step=time_step,
                                          rconf=self.rconf,
                                          logger=self.logger)
                plotter.add_timescale(timescale)

            # add labelsets, if any: each "-i" must be followed by a path
            for flag, label_file_path in zip(arguments, arguments[1:]):
                if flag != "-i":
                    continue
                extension = gf.file_extension(label_file_path)
                if extension == "vad":
                    # VAD files are read as TSV, without text nor labels
                    _add_labelset(label_file_path, SyncMapFormat.TSV,
                                  False, False)
                if extension in SyncMapFormat.ALLOWED_VALUES:
                    _add_labelset(label_file_path, extension,
                                  fragment_text, labels)

            # output to file
            plotter.draw_png(output_file_path, h_zoom=h_zoom, v_zoom=v_zoom)
            self.print_info(u"Plotting to file... done")
            self.print_success(u"Created file '%s'" % output_file_path)
            return self.NO_ERROR_EXIT_CODE
        except ImportError:
            for line in (
                    u"You need to install Python module Pillow to output image to file. Run:",
                    u"$ pip install Pillow",
                    u"or, to install for all users:",
                    u"$ sudo pip install Pillow"):
                self.print_error(line)
        except Exception as exc:
            self.print_error(
                u"An unexpected error occurred while generating the image file:"
            )
            self.print_error(u"%s" % exc)

        return self.ERROR_EXIT_CODE
예제 #54
0
    def perform_command(self):
        """
        Plot the waveform of an audio file to an image file,
        optionally adding a time scale and one or more label sets,
        and return the appropriate exit code.

        The first positional argument is the input audio file,
        the second one is the output image file.
        Each ``-i PATH`` pair in the arguments adds a label set
        read from ``PATH``.

        :rtype: int
        """
        args = self.actual_arguments
        if len(args) < 2:
            return self.print_help()
        input_file_path, output_file_path = args[0], args[1]

        # both paths must be usable; the output path is checked only if the input path is fine
        if not (self.check_input_file(input_file_path) and self.check_output_file(output_file_path)):
            return self.ERROR_EXIT_CODE

        # rendering options
        fast = self.has_option("--fast")
        fragment_text = self.has_option("--text")
        h_zoom = gf.safe_int(self.has_option_with_value("--hzoom"), 5)
        label = self.has_option_with_value("--label")
        time_step = gf.safe_int(self.has_option_with_value("--time-step"), 0)
        v_zoom = gf.safe_int(self.has_option_with_value("--vzoom"), 30)

        # label set switches: everything is drawn unless explicitly disabled
        show_labels = not self.has_option("--no-labels")
        shared_parameters = [
            ("begin_time", not self.has_option("--no-begin-times")),
            ("end_time", not self.has_option("--no-end-times")),
            ("begin_guide", not self.has_option("--no-begin-guides")),
            ("end_guide", not self.has_option("--no-end-guides")),
        ]

        try:
            # the plotter imports are kept here so that a failing import
            # is reported by the ImportError handler below
            from aeneas.plotter import PlotLabelset
            from aeneas.plotter import PlotTimeScale
            from aeneas.plotter import PlotWaveform
            from aeneas.plotter import Plotter

            self.print_info(u"Plotting to file...")
            plotter = Plotter(rconf=self.rconf, logger=self.logger)

            def add_labelset(path, syncmap_format, with_text, with_labels):
                # read the sync map at path and add it to the plot as a label set
                fragments = self._read_syncmap_file(path, syncmap_format, with_text)
                plot_labelset = PlotLabelset(fragments, parameters=None, rconf=self.rconf, logger=self.logger)
                plot_labelset.parameters["labels"] = with_labels
                for key, value in shared_parameters:
                    plot_labelset.parameters[key] = value
                plotter.add_labelset(plot_labelset)

            # the waveform itself
            waveform_audio = AudioFile(input_file_path, rconf=self.rconf, logger=self.logger)
            waveform_audio.read_samples_from_file()
            waveform = PlotWaveform(waveform_audio, label=label, fast=fast, rconf=self.rconf, logger=self.logger)
            plotter.add_waveform(waveform)

            # the time scale is drawn only for a positive time step
            if time_step > 0:
                time_scale = PlotTimeScale(
                    waveform_audio.audio_length,
                    time_step=time_step,
                    rconf=self.rconf,
                    logger=self.logger
                )
                plotter.add_timescale(time_scale)

            # each "-i" followed by another argument designates a label file
            for flag, label_file_path in zip(args, args[1:]):
                if flag != "-i":
                    continue
                extension = gf.file_extension(label_file_path)
                if extension == "vad":
                    # "vad" files are read as TSV, without fragment text and without labels
                    add_labelset(label_file_path, SyncMapFormat.TSV, False, False)
                if extension in SyncMapFormat.ALLOWED_VALUES:
                    add_labelset(label_file_path, extension, fragment_text, show_labels)

            # render the image
            plotter.draw_png(output_file_path, h_zoom=h_zoom, v_zoom=v_zoom)
            self.print_info(u"Plotting to file... done")
            self.print_success(u"Created file '%s'" % output_file_path)
            return self.NO_ERROR_EXIT_CODE
        except ImportError:
            self.print_error(u"You need to install Python module Pillow to output image to file. Run:")
            self.print_error(u"$ pip install Pillow")
            self.print_error(u"or, to install for all users:")
            self.print_error(u"$ sudo pip install Pillow")
        except Exception as exc:
            self.print_error(u"An unexpected error occurred while generating the image file:")
            self.print_error(u"%s" % exc)

        return self.ERROR_EXIT_CODE
예제 #55
0
    def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize the fragments of ``text_file`` one by one, via Python call,
        concatenate the resulting audio samples,
        and write them to ``output_file_path``.

        On success, return ``(True, (anchors, total_time, num_chars))``,
        where each anchor is a list ``[begin_time, identifier, text]``.
        If anything goes wrong, the exception is passed to ``self.log_exc``
        and ``(False, None)`` is returned.

        :param text_file: the object whose ``fragments`` are synthesized
        :param output_file_path: the path of the audio file to be written
        :param quit_after: if not ``None``, stop synthesizing as soon as
                           the accumulated duration exceeds this value
        :param bool backwards: if ``True``, process the fragments
                               from the last one to the first one
        :rtype: (bool, (list, TimeValue, int))
        """
        self.log(u"Calling TTS engine via Python...")
        try:
            # container collecting the samples of all the fragments
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = "pcm16"
            output_file.audio_channels = 1
            output_file.audio_sample_rate = self.SAMPLE_RATE

            anchors = []
            current_time = TimeValue("0.000")
            num_chars = 0
            fragments = text_file.fragments[::-1] if backwards else text_file.fragments
            for num, fragment in enumerate(fragments):
                voice_code = self._language_to_voice_code(fragment.language)
                self.log([u"Synthesizing fragment %d", num])
                # only the duration and the samples are used here
                duration, _, _, data = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code
                )
                # the fragment begins where the audio accumulated so far ends
                anchors.append([current_time, fragment.identifier, fragment.text])
                num_chars += fragment.characters
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    # when going backwards, each chunk is stored reversed
                    output_file.add_samples(data, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # stop early once enough audio has been produced
                if quit_after is None:
                    continue
                if current_time > quit_after:
                    self.log([u"Quitting after reached duration %.3f", current_time])
                    break

            # undo the per-chunk reversal over the whole stream
            if backwards:
                output_file.reverse()

            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"Unexpected exception while calling TTS engine via Python", exc, None, type(exc))
            return (False, None)

        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
예제 #56
0
    def _synthesize_single_subprocess(self, text, voice_code, output_file_path):
        """
        Synthesize a single text fragment via ``subprocess``.

        :rtype: tuple (result, (duration, sample_rate, encoding, samples))
        """
        self.log(u"Synthesizing using pure Python...")
        try:
            # if the TTS engine reads text from file,
            # write the text into a temporary file
            if self.CLI_PARAMETER_TEXT_PATH in self.subprocess_arguments:
                self.log(u"TTS engine reads text from file")
                tmp_text_file_handler, tmp_text_file_path = gf.tmp_file(suffix=u".txt", root=self.rconf[RuntimeConfiguration.TMP_PATH])
                self.log([u"Creating temporary text file '%s'...", tmp_text_file_path])
                with io.open(tmp_text_file_path, "w", encoding="utf-8") as tmp_text_file:
                    tmp_text_file.write(text)
                self.log([u"Creating temporary text file '%s'... done", tmp_text_file_path])
            else:
                self.log(u"TTS engine reads text from stdin")
                tmp_text_file_handler = None
                tmp_text_file_path = None

            # copy all relevant arguments
            self.log(u"Creating arguments list...")
            arguments = []
            for arg in self.subprocess_arguments:
                if arg == self.CLI_PARAMETER_VOICE_CODE_FUNCTION:
                    arguments.extend(self._voice_code_to_subprocess(voice_code))
                elif arg == self.CLI_PARAMETER_VOICE_CODE_STRING:
                    arguments.append(voice_code)
                elif arg == self.CLI_PARAMETER_TEXT_PATH:
                    arguments.append(tmp_text_file_path)
                elif arg == self.CLI_PARAMETER_WAVE_PATH:
                    arguments.append(output_file_path)
                elif arg == self.CLI_PARAMETER_TEXT_STDIN:
                    # placeholder, do not append
                    pass
                elif arg == self.CLI_PARAMETER_WAVE_STDOUT:
                    # placeholder, do not append
                    pass
                else:
                    arguments.append(arg)
            self.log(u"Creating arguments list... done")

            # actual call via subprocess
            self.log(u"Calling TTS engine...")
            self.log([u"Calling with arguments '%s'", arguments])
            self.log([u"Calling with text '%s'", text])
            proc = subprocess.Popen(
                arguments,
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True
            )
            if self.CLI_PARAMETER_TEXT_STDIN in self.subprocess_arguments:
                self.log(u"Passing text via stdin...")
                if gf.PY2:
                    (stdoutdata, stderrdata) = proc.communicate(input=gf.safe_bytes(text))
                else:
                    (stdoutdata, stderrdata) = proc.communicate(input=text)
                self.log(u"Passing text via stdin... done")
            else:
                self.log(u"Passing text via file...")
                (stdoutdata, stderrdata) = proc.communicate()
                self.log(u"Passing text via file... done")
            proc.stdout.close()
            proc.stdin.close()
            proc.stderr.close()

            if self.CLI_PARAMETER_WAVE_STDOUT in self.subprocess_arguments:
                self.log(u"TTS engine wrote audio data to stdout")
                self.log([u"Writing audio data to file '%s'...", output_file_path])
                with io.open(output_file_path, "wb") as output_file:
                    output_file.write(stdoutdata)
                self.log([u"Writing audio data to file '%s'... done", output_file_path])
            else:
                self.log(u"TTS engine wrote audio data to file")

            if tmp_text_file_path is not None:
                self.log([u"Delete temporary text file '%s'", tmp_text_file_path])
                gf.delete_file(tmp_text_file_handler, tmp_text_file_path)

            self.log(u"Calling TTS ... done")
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while calling TTS engine via subprocess", exc, False, None)
            return (False, None)

        # check the file can be read
        if not gf.file_can_be_read(output_file_path):
            self.log_exc(u"Output file '%s' cannot be read" % (output_file_path), None, True, None)
            return (False, None)

        # return the duration of the output file
        try:
            # if we know the TTS outputs to PCM16 mono WAVE,
            # we can read samples directly from it,
            # without an intermediate conversion through ffmpeg
            audio_file = AudioFile(
                file_path=output_file_path,
                is_mono_wave=self.OUTPUT_MONO_WAVE,
                rconf=self.rconf,
                logger=self.logger
            )
            audio_file.read_samples_from_file()
            self.log([u"Duration of '%s': %f", output_file_path, audio_file.audio_length])
            self.log(u"Synthesizing using pure Python... done")
            return (True, (
                audio_file.audio_length,
                audio_file.audio_sample_rate,
                audio_file.audio_format,
                audio_file.audio_samples
            ))
        except (AudioFileUnsupportedFormatError, OSError) as exc:
            self.log_exc(u"An unexpected error occurred while trying to read the sythesized audio file", exc, True, None)
            return (False, None)