Ejemplo n.º 1
0
 def test_preallocate_smaller(self):
     """
     Preallocating fewer slots than the number of stored samples
     is expected to leave only that many samples in ``audio_samples``.
     """
     audio = AudioFile()
     # preallocated memory must not be reported as samples
     audio.preallocate_memory(100)
     self.assertEqual(len(audio.audio_samples), 0)
     # store five actual samples
     five_samples = numpy.array([1, 2, 3, 4, 5])
     audio.add_samples(five_samples)
     self.assertEqual(len(audio.audio_samples), 5)
     # ask for less room than the samples currently stored
     audio.preallocate_memory(2)
     self.assertEqual(len(audio.audio_samples), 2)
Ejemplo n.º 2
0
 def test_preallocate_smaller(self):
     """
     Preallocating fewer slots than the number of stored samples
     is expected to leave only that many samples in ``audio_samples``.
     """
     audio = AudioFile()
     # preallocated memory must not be reported as samples
     audio.preallocate_memory(100)
     self.assertEqual(len(audio.audio_samples), 0)
     # store five actual samples
     five_samples = numpy.array([1, 2, 3, 4, 5])
     audio.add_samples(five_samples)
     self.assertEqual(len(audio.audio_samples), 5)
     # ask for less room than the samples currently stored
     audio.preallocate_memory(2)
     self.assertEqual(len(audio.audio_samples), 2)
Ejemplo n.º 3
0
 def test_add_samples_reverse_memory(self):
     """
     Two chunks added with ``reverse=True`` are expected to be stored
     in insertion order, each chunk with its own samples reversed.
     """
     audio = AudioFile()
     for chunk in ([1, 2, 3, 4, 5], [6, 7, 8, 9, 10]):
         audio.add_samples(numpy.array(chunk), reverse=True)
     self.assertEqual(len(audio.audio_samples), 10)
     # (position, expected sample) at the start, inside, and end of each chunk
     expectations = ((0, 5), (1, 4), (4, 1), (5, 10), (6, 9), (9, 6))
     for position, expected in expectations:
         self.assertEqual(audio.audio_samples[position], expected)
Ejemplo n.º 4
0
 def test_add_samples_memory(self):
     """
     Two chunks added one after the other are expected to be stored
     back to back, with their samples in the original order.
     """
     audio = AudioFile()
     for chunk in ([1, 2, 3, 4, 5], [6, 7, 8, 9, 10]):
         audio.add_samples(numpy.array(chunk))
     self.assertEqual(len(audio.audio_samples), 10)
     # (position, expected sample) at the start, inside, and end of each chunk
     expectations = ((0, 1), (1, 2), (4, 5), (5, 6), (6, 7), (9, 10))
     for position, expected in expectations:
         self.assertEqual(audio.audio_samples[position], expected)
Ejemplo n.º 5
0
 def test_add_samples_reverse_memory(self):
     """
     Two chunks added with ``reverse=True`` are expected to be stored
     in insertion order, each chunk with its own samples reversed.
     """
     audio = AudioFile()
     for chunk in ([1, 2, 3, 4, 5], [6, 7, 8, 9, 10]):
         audio.add_samples(numpy.array(chunk), reverse=True)
     self.assertEqual(len(audio.audio_samples), 10)
     # (position, expected sample) at the start, inside, and end of each chunk
     expectations = ((0, 5), (1, 4), (4, 1), (5, 10), (6, 9), (9, 6))
     for position, expected in expectations:
         self.assertEqual(audio.audio_samples[position], expected)
Ejemplo n.º 6
0
 def test_add_samples_memory(self):
     """
     Two chunks added one after the other are expected to be stored
     back to back, with their samples in the original order.
     """
     audio = AudioFile()
     for chunk in ([1, 2, 3, 4, 5], [6, 7, 8, 9, 10]):
         audio.add_samples(numpy.array(chunk))
     self.assertEqual(len(audio.audio_samples), 10)
     # (position, expected sample) at the start, inside, and end of each chunk
     expectations = ((0, 1), (1, 2), (4, 5), (5, 6), (6, 7), (9, 10))
     for position, expected in expectations:
         self.assertEqual(audio.audio_samples[position], expected)
Ejemplo n.º 7
0
    def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize all the fragments of ``text_file`` via Python call,
        concatenate the resulting audio, and write it to ``output_file_path``.

        When no exception occurs, return
        ``(True, (anchors, total_time, num_chars))``, where each anchor is
        the list ``[start_time, fragment.identifier, fragment.text]``.
        If an exception occurs, it is passed to ``log_exc()``;
        the statement following that call returns ``(False, None)``.

        :param text_file: object exposing the ``fragments`` to be synthesized
        :param output_file_path: path passed to ``AudioFile.write()``
        :param quit_after: if not ``None``, stop synthesizing as soon as
                           the accumulated duration exceeds this value
        :param backwards: if true, iterate the fragments in reverse order,
                          add each chunk with ``reverse=True``, and call
                          ``reverse()`` on the whole audio before writing it
        :rtype: (bool, (list, TimeValue, int))
        """
        # wave data is generated fragment by fragment and concatenated
        self.log(u"Calling TTS engine via Python...")
        try:
            # container accumulating the synthesized audio
            audio = AudioFile(rconf=self.rconf, logger=self.logger)
            audio.audio_format = "pcm16"
            audio.audio_channels = 1
            audio.audio_sample_rate = self.SAMPLE_RATE

            time_anchors = []
            elapsed = TimeValue("0.000")
            char_count = 0
            ordered = text_file.fragments[::-1] if backwards else text_file.fragments
            for index, fragment in enumerate(ordered):
                # map the fragment language to a voice code
                voice = self._language_to_voice_code(fragment.language)
                self.log([u"Synthesizing fragment %d", index])
                # only the duration and the samples of the result are used
                duration, _rate, _encoding, samples = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice
                )
                # the fragment starts where the audio accumulated so far ends
                time_anchors.append([elapsed, fragment.identifier, fragment.text])
                char_count += fragment.characters
                self.log([u"Fragment %d starts at: %.3f", index, elapsed])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", index, duration])
                    elapsed += duration
                    # when going backwards, the chunk is added reversed
                    audio.add_samples(samples, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", index])
                # stop as soon as enough audio has been produced
                enough_audio = (quit_after is not None) and (elapsed > quit_after)
                if enough_audio:
                    self.log([u"Quitting after reached duration %.3f", elapsed])
                    break

            # when going backwards, the whole audio is reversed once more
            if backwards:
                audio.reverse()

            self.log([u"Writing audio file '%s'", output_file_path])
            audio.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"Unexpected exception while calling TTS engine via Python", exc, None, type(exc))
            return (False, None)

        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(time_anchors)])
        self.log([u"Current time %.3f", elapsed])
        self.log([u"Synthesized %d characters", char_count])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (time_anchors, elapsed, char_count))
Ejemplo n.º 8
0
    def _synthesize_multiple_python(self,
                                    text_file,
                                    output_file_path,
                                    quit_after=None,
                                    backwards=False):
        """
        Synthesize all the fragments of ``text_file`` via Python call,
        concatenate the resulting audio, and write it to ``output_file_path``.

        When no exception occurs, return
        ``(True, (anchors, total_time, num_chars))``, where each anchor is
        the list ``[start_time, fragment.identifier, fragment.text]``.
        If an exception occurs, it is passed to ``log_exc()``;
        the statement following that call returns ``(False, None)``.

        :param text_file: object exposing the ``fragments`` to be synthesized
        :param output_file_path: path passed to ``AudioFile.write()``
        :param quit_after: if not ``None``, stop synthesizing as soon as
                           the accumulated duration exceeds this value
        :param backwards: if true, iterate the fragments in reverse order,
                          add each chunk with ``reverse=True``, and call
                          ``reverse()`` on the whole audio before writing it
        :rtype: (bool, (list, TimeValue, int))
        """
        # wave data is generated fragment by fragment and concatenated
        self.log(u"Calling TTS engine via Python...")
        try:
            # container accumulating the synthesized audio
            audio = AudioFile(rconf=self.rconf, logger=self.logger)
            audio.audio_format = "pcm16"
            audio.audio_channels = 1
            audio.audio_sample_rate = self.SAMPLE_RATE

            time_anchors = []
            elapsed = TimeValue("0.000")
            char_count = 0
            if backwards:
                ordered = text_file.fragments[::-1]
            else:
                ordered = text_file.fragments
            for index, fragment in enumerate(ordered):
                # map the fragment language to a voice code
                voice = self._language_to_voice_code(fragment.language)
                self.log([u"Synthesizing fragment %d", index])
                # only the duration and the samples of the result are used
                synthesized = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice)
                duration, _rate, _encoding, samples = synthesized
                # the fragment starts where the audio accumulated so far ends
                time_anchors.append(
                    [elapsed, fragment.identifier, fragment.text])
                char_count += fragment.characters
                self.log([u"Fragment %d starts at: %.3f", index, elapsed])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", index, duration])
                    elapsed += duration
                    # when going backwards, the chunk is added reversed
                    audio.add_samples(samples, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", index])
                # stop as soon as enough audio has been produced
                if (quit_after is not None) and (elapsed > quit_after):
                    self.log(
                        [u"Quitting after reached duration %.3f", elapsed])
                    break

            # when going backwards, the whole audio is reversed once more
            if backwards:
                audio.reverse()

            self.log([u"Writing audio file '%s'", output_file_path])
            audio.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(
                u"Unexpected exception while calling TTS engine via Python",
                exc, None, type(exc))
            return (False, None)

        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(time_anchors)])
        self.log([u"Current time %.3f", elapsed])
        self.log([u"Synthesized %d characters", char_count])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (time_anchors, elapsed, char_count))
Ejemplo n.º 9
0
    def _synthesize_multiple_generic(self,
                                     helper_function,
                                     text_file,
                                     output_file_path,
                                     quit_after=None,
                                     backwards=False):
        """
        Synthesize all the fragments of ``text_file`` using ``helper_function``,
        concatenate the resulting audio, and write it to ``output_file_path``.

        The ``helper_function`` is a function that takes parameters
        ``(text, voice_code, output_file_path)``
        and returns a tuple
        ``(result, (audio_length, audio_sample_rate, audio_format, audio_samples))``.

        The codec and the sample rate are read from ``OUTPUT_AUDIO_FORMAT``
        when it is a triple; otherwise they are obtained by synthesizing
        a dummy text with ``helper_function``.

        Return ``(False, None)`` as soon as the dummy synthesis
        or the synthesis of a fragment reports a failure.

        :param helper_function: function synthesizing a single text
        :param text_file: object exposing the ``fragments`` to be synthesized
        :param output_file_path: path passed to ``AudioFile.write()``
        :param quit_after: if not ``None``, stop synthesizing as soon as
                           the accumulated duration exceeds this value
        :param backwards: if true, iterate the fragments in reverse order,
                          add each chunk with ``reverse=True``, and call
                          ``reverse()`` on the whole audio before writing it
        :rtype: tuple (result, (anchors, current_time, num_chars))
        """
        self.log(u"Calling TTS engine using multiple generic function...")

        # find out the codec and the sample rate of the synthesized audio
        self.log(u"Determining codec and sample rate...")
        if (self.OUTPUT_AUDIO_FORMAT is not None) and (len(self.OUTPUT_AUDIO_FORMAT) == 3):
            self.log(u"Reading codec and sample rate from OUTPUT_AUDIO_FORMAT")
            # the number of channels (middle element) is not used
            codec, _channels, sample_rate = self.OUTPUT_AUDIO_FORMAT
        else:
            self.log(u"Determining codec and sample rate with dummy text...")
            dummy_ok, dummy_data = helper_function(
                text=u"Dummy text to get sample_rate",
                voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE),
                output_file_path=None)
            if not dummy_ok:
                self.log_crit(
                    u"An unexpected error occurred in helper_function")
                return (False, None)
            # duration and samples of the dummy text are discarded
            _dummy_duration, sample_rate, codec, _dummy_samples = dummy_data
            self.log(
                u"Determining codec and sample rate with dummy text... done")
        self.log(u"Determining codec and sample rate... done")
        self.log([u"  codec:       %s", codec])
        self.log([u"  sample rate: %d", sample_rate])

        # container accumulating the synthesized audio
        audio = AudioFile(rconf=self.rconf, logger=self.logger)
        audio.audio_format = codec
        audio.audio_channels = 1
        audio.audio_sample_rate = sample_rate

        time_anchors = []
        elapsed = TimeValue("0.000")
        char_count = 0
        ordered = text_file.fragments[::-1] if backwards else text_file.fragments
        # choose the per-fragment function depending on the cache setting
        if self.use_cache:
            synthesize_fragment = self._loop_use_cache
        else:
            synthesize_fragment = self._loop_no_cache
        for index, fragment in enumerate(ordered):
            fragment_ok, fragment_data = synthesize_fragment(
                helper_function=helper_function,
                num=index,
                fragment=fragment)
            if not fragment_ok:
                self.log_crit(u"An unexpected error occurred in loop_function")
                return (False, None)
            # only the duration and the samples of the result are used
            duration, _rate, _encoding, samples = fragment_data
            # the fragment starts where the audio accumulated so far ends
            time_anchors.append([elapsed, fragment.identifier, fragment.text])
            char_count += fragment.characters
            self.log([u"Fragment %d starts at: %.3f", index, elapsed])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", index, duration])
                elapsed += duration
                audio.add_samples(samples, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", index])
            # stop as soon as enough audio has been produced
            enough_audio = (quit_after is not None) and (elapsed > quit_after)
            if enough_audio:
                self.log(
                    [u"Quitting after reached duration %.3f", elapsed])
                break

        self.log(u"Minimizing memory...")
        audio.minimize_memory()
        self.log(u"Minimizing memory... done")

        # when going backwards, the whole audio is reversed once more
        if backwards:
            self.log(u"Reversing audio samples...")
            audio.reverse()
            self.log(u"Reversing audio samples... done")

        self.log([u"Writing audio file '%s'", output_file_path])
        audio.write(file_path=output_file_path)

        if backwards:
            self.log_warn(
                u"Please note that anchor time values do not make sense since backwards=True"
            )
        self.log([u"Returning %d time anchors", len(time_anchors)])
        self.log([u"Current time %.3f", elapsed])
        self.log([u"Synthesized %d characters", char_count])
        self.log(u"Calling TTS engine using multiple generic function... done")
        return (True, (time_anchors, elapsed, char_count))
Ejemplo n.º 10
0
    def _synthesize_multiple_subprocess(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize multiple fragments via ``subprocess``,
        concatenate the resulting audio, and write it to ``output_file_path``.

        The sample rate and the encoding of the output are obtained
        by synthesizing a dummy text before the actual fragments.

        When no exception occurs, return
        ``(True, (anchors, current_time, num_chars))``, where each anchor is
        the list ``[start_time, fragment.identifier, fragment.text]``.
        If an exception occurs, it is passed to ``log_exc()``
        and ``(False, None)`` is returned.

        :param text_file: object exposing the ``fragments`` to be synthesized
        :param output_file_path: path passed to ``AudioFile.write()``
        :param quit_after: if not ``None``, stop synthesizing as soon as
                           the accumulated duration exceeds this value
        :param backwards: if true, iterate the fragments in reverse order,
                          add each chunk with ``reverse=True``, and call
                          ``reverse()`` on the whole audio before writing it
        :rtype: tuple (result, (anchors, current_time, num_chars))
        """
        def synthesize_and_clean(text, voice_code):
            """
            Synthesize a single fragment via subprocess,
            and immediately remove the temporary file.

            The temporary file is removed even if the synthesis raises,
            so that a failed call does not leave it behind.

            :rtype: tuple (duration, sample_rate, encoding, samples)
            """
            self.log(u"Synthesizing text...")
            handler, tmp_destination = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
            try:
                # the success flag is not inspected here: only data is returned
                _result, data = self._synthesize_single_subprocess(
                    text=(text + u" "),
                    voice_code=voice_code,
                    output_file_path=tmp_destination
                )
            finally:
                # always clean up, otherwise an exception would leak the file
                self.log([u"Removing temporary file '%s'", tmp_destination])
                gf.delete_file(handler, tmp_destination)
            self.log(u"Synthesizing text... done")
            return data

        self.log(u"Calling TTS engine via subprocess...")

        try:
            # get sample rate and encoding; duration and samples are discarded
            du_nu, sample_rate, encoding, da_nu = synthesize_and_clean(
                text=u"Dummy text to get sample_rate",
                voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE)
            )

            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = encoding
            output_file.audio_channels = 1
            output_file.audio_sample_rate = sample_rate

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for num, fragment in enumerate(fragments):
                # language to voice code
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, samples = synthesize_and_clean(
                    text=fragment.filtered_text,
                    voice_code=voice_code
                )
                # store for later output
                anchors.append([current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # concatenate new samples
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    output_file.add_samples(samples, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([u"Quitting after reached duration %.3f", current_time])
                    break

            # minimize memory
            self.log(u"Minimizing memory...")
            output_file.minimize_memory()
            self.log(u"Minimizing memory... done")

            # if backwards, we need to reverse the audio samples again
            if backwards:
                self.log(u"Reversing audio samples...")
                output_file.reverse()
                self.log(u"Reversing audio samples... done")

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while calling TTS engine via subprocess", exc, False, None)
            return (False, None)

        # return output
        if backwards:
            self.log_warn(u"Please note that anchor time values do not make sense since backwards=True")
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via subprocess... done")
        return (True, (anchors, current_time, num_chars))
Ejemplo n.º 11
0
    def _synthesize_multiple_python(self,
                                    text_file,
                                    output_file_path,
                                    quit_after=None,
                                    backwards=False):
        """
        Synthesize all the fragments of ``text_file`` via Python call,
        concatenate the resulting audio, and write it to ``output_file_path``.

        The sample rate and the encoding of the output are obtained
        by synthesizing a dummy text before the actual fragments.

        When no exception occurs, return
        ``(True, (anchors, total_time, num_chars))``, where each anchor is
        the list ``[start_time, fragment.identifier, fragment.text]``.
        If an exception occurs, it is passed to ``log_exc()``;
        the statement following that call returns ``(False, None)``.

        :param text_file: object exposing the ``fragments`` to be synthesized
        :param output_file_path: path passed to ``AudioFile.write()``
        :param quit_after: if not ``None``, stop synthesizing as soon as
                           the accumulated duration exceeds this value
        :param backwards: if true, iterate the fragments in reverse order,
                          add each chunk with ``reverse=True``, and call
                          ``reverse()`` on the whole audio before writing it
        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # TODO no way was found in the Speect Python API
        #      to generate the wave incrementally,
        #      so this mirrors the subprocess call mechanism:
        #      wave data is generated for each fragment
        #      and the chunks are concatenated together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # a dummy synthesis reveals the sample rate and the encoding;
            # its duration and samples are discarded
            dummy = self._synthesize_single_helper(
                text=u"Dummy text to get sample_rate",
                voice_code=self.DEFAULT_LANGUAGE)
            _dummy_duration, sample_rate, encoding, _dummy_samples = dummy

            # container accumulating the synthesized audio
            audio = AudioFile(rconf=self.rconf, logger=self.logger)
            audio.audio_format = encoding
            audio.audio_channels = 1
            audio.audio_sample_rate = sample_rate

            time_anchors = []
            elapsed = TimeValue("0.000")
            char_count = 0
            if backwards:
                ordered = text_file.fragments[::-1]
            else:
                ordered = text_file.fragments
            for index, fragment in enumerate(ordered):
                #
                # NOTE according to the original author,
                # _synthesize_single_helper() ignores voice_code,
                # hence its value is irrelevant here
                #
                # in general, the text language must be mapped
                # to a voice code with _language_to_voice_code();
                # the one applied here is said to be the one defined
                # in super(), setting voice_code = fragment.language
                #
                voice = self._language_to_voice_code(fragment.language)
                self.log([u"Synthesizing fragment %d", index])
                # only the duration and the samples of the result are used
                synthesized = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice)
                duration, _rate, _encoding, samples = synthesized
                # the fragment starts where the audio accumulated so far ends
                time_anchors.append(
                    [elapsed, fragment.identifier, fragment.text])
                char_count += fragment.characters
                self.log([u"Fragment %d starts at: %.3f", index, elapsed])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", index, duration])
                    elapsed += duration
                    # when going backwards, the chunk is added reversed
                    audio.add_samples(samples, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", index])
                # stop as soon as enough audio has been produced
                if (quit_after is not None) and (elapsed > quit_after):
                    self.log(
                        [u"Quitting after reached duration %.3f", elapsed])
                    break

            # when going backwards, the whole audio is reversed once more
            if backwards:
                audio.reverse()

            self.log([u"Writing audio file '%s'", output_file_path])
            audio.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(
                u"An unexpected error occurred while calling TTS engine via Python",
                exc, False, None)
            return (False, None)

        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(time_anchors)])
        self.log([u"Current time %.3f", elapsed])
        self.log([u"Synthesized %d characters", char_count])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (time_anchors, elapsed, char_count))
Ejemplo n.º 12
0
    def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize all the fragments of ``text_file`` via Python call,
        concatenate the resulting audio, and write it to ``output_file_path``.

        The sample rate and the encoding of the output are obtained
        by synthesizing a dummy text before the actual fragments.

        When no exception occurs, return
        ``(True, (anchors, total_time, num_chars))``, where each anchor is
        the list ``[start_time, fragment.identifier, fragment.text]``.
        If an exception occurs, it is passed to ``log_exc()``;
        the statement following that call returns ``(False, None)``.

        :param text_file: object exposing the ``fragments`` to be synthesized
        :param output_file_path: path passed to ``AudioFile.write()``
        :param quit_after: if not ``None``, stop synthesizing as soon as
                           the accumulated duration exceeds this value
        :param backwards: if true, iterate the fragments in reverse order,
                          add each chunk with ``reverse=True``, and call
                          ``reverse()`` on the whole audio before writing it
        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # TODO no way was found in the Speect Python API
        #      to generate the wave incrementally,
        #      so this mirrors the subprocess call mechanism:
        #      wave data is generated for each fragment
        #      and the chunks are concatenated together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # a dummy synthesis reveals the sample rate and the encoding;
            # its duration and samples are discarded
            _dummy_duration, sample_rate, encoding, _dummy_samples = self._synthesize_single_helper(
                text=u"Dummy text to get sample_rate",
                voice_code=self.DEFAULT_LANGUAGE
            )

            # container accumulating the synthesized audio
            audio = AudioFile(rconf=self.rconf, logger=self.logger)
            audio.audio_format = encoding
            audio.audio_channels = 1
            audio.audio_sample_rate = sample_rate

            time_anchors = []
            elapsed = TimeValue("0.000")
            char_count = 0
            ordered = text_file.fragments[::-1] if backwards else text_file.fragments
            for index, fragment in enumerate(ordered):
                #
                # NOTE according to the original author,
                # _synthesize_single_helper() ignores voice_code,
                # hence its value is irrelevant here
                #
                # in general, the text language must be mapped
                # to a voice code with _language_to_voice_code();
                # the one applied here is said to be the one defined
                # in super(), setting voice_code = fragment.language
                #
                voice = self._language_to_voice_code(fragment.language)
                self.log([u"Synthesizing fragment %d", index])
                # only the duration and the samples of the result are used
                duration, _rate, _encoding, samples = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice
                )
                # the fragment starts where the audio accumulated so far ends
                time_anchors.append([elapsed, fragment.identifier, fragment.text])
                char_count += fragment.characters
                self.log([u"Fragment %d starts at: %.3f", index, elapsed])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", index, duration])
                    elapsed += duration
                    # when going backwards, the chunk is added reversed
                    audio.add_samples(samples, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", index])
                # stop as soon as enough audio has been produced
                enough_audio = (quit_after is not None) and (elapsed > quit_after)
                if enough_audio:
                    self.log([u"Quitting after reached duration %.3f", elapsed])
                    break

            # when going backwards, the whole audio is reversed once more
            if backwards:
                audio.reverse()

            self.log([u"Writing audio file '%s'", output_file_path])
            audio.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while calling TTS engine via Python", exc, False, None)
            return (False, None)

        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(time_anchors)])
        self.log([u"Current time %.3f", elapsed])
        self.log([u"Synthesized %d characters", char_count])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (time_anchors, elapsed, char_count))