def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple text fragments, via Python call.

    Return a tuple (anchors, total_time, num_chars).

    :param text_file: the text file whose fragments should be synthesized
    :param string output_file_path: path of the WAVE file to be written
    :param quit_after: if not ``None``, stop synthesizing as soon as the
                       produced audio exceeds this duration
    :param bool backwards: if ``True``, synthesize the fragments in reverse
                           order (anchors then do not make sense)
    :rtype: (bool, (list, TimeValue, int))
    """
    #
    # generating wave data for each fragment,
    # and concatenating them together
    #
    self.log(u"Calling TTS engine via Python...")
    try:
        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = "pcm16"
        output_file.audio_channels = 1
        output_file.audio_sample_rate = self.SAMPLE_RATE
        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num = 0
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        for fragment in fragments:
            # language to voice code
            voice_code = self._language_to_voice_code(fragment.language)
            # synthesize and get the duration of the output file
            self.log([u"Synthesizing fragment %d", num])
            duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code
            )
            # store for later output
            anchors.append([current_time, fragment.identifier, fragment.text])
            # increase the character counter
            num_chars += fragment.characters
            # append new data
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # if backwards, we append the data reversed
                output_file.add_samples(data, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # increment fragment counter
            num += 1
            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log([u"Quitting after reached duration %.3f", current_time])
                break
        # if backwards, we need to reverse the audio samples again
        if backwards:
            output_file.reverse()
        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        # FIX: pass critical=False and raise_type=None (like the sibling
        # _synthesize_multiple_* methods do); the previous call passed
        # type(exc) as raise_type, which made log_exc re-raise and left the
        # (False, None) return below unreachable, breaking the documented
        # (bool, ...) return contract.
        self.log_exc(u"Unexpected exception while calling TTS engine via Python", exc, False, None)
        return (False, None)
    # return output
    # NOTE anchors do not make sense if backwards
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via Python... done")
    return (True, (anchors, current_time, num_chars))
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect start.

    Synthesize a query wave from the text, extract its MFCCs, and slide a
    DTW match over the speech intervals of the real audio to locate where
    the text starts; return the detected start time (0.0 if no candidate).

    # NOTE(review): min_start_length / max_start_length appear to be
    # expressed in seconds and metric an SDMetric value — confirm at callers.
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric: %s", metric])
    self._log(["Backwards: %s", str(backwards)])
    # characters per second of the real audio (used to stretch the query)
    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate: %.3f", audio_rate])
    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav", dir=gf.custom_tmp_dir())
    synt = Synthesizer(logger=self.logger)
    # synthesize more than max_start_length to have enough query audio
    synt_duration = max_start_length * self.QUERY_FACTOR
    self._log(["Synthesizing %.3f seconds", synt_duration])
    result = synt.synthesize(self.text_file, tmp_file_path, quit_after=synt_duration, backwards=backwards)
    self._log("Synthesizing query... done")
    query_file = AudioFile(tmp_file_path)
    if backwards:
        self._log("Reversing query")
        query_file.reverse()
    self._log("Extracting MFCCs for query...")
    query_file.extract_mfcc(frame_rate=self.frame_rate)
    query_file.clear_data()
    self._log("Extracting MFCCs for query... done")
    self._log("Cleaning up...")
    # remove the temporary query wave file
    self._cleanup(tmp_handler, tmp_file_path)
    self._log("Cleaning up... done")
    # result[2] is the number of synthesized characters (see Synthesizer)
    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len
    # stretch factor >= 1: how much slower the real audio is vs the query
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate: %.3f", audio_rate])
    self._log(["Query rate: %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])
    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    # only search within the first max_start_length * AUDIO_FACTOR seconds
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])
    # l = number of MFCC coefficients, o = audio frames, n = query frames
    l, o = audio_mfcc.shape
    l, n = query_mfcc.shape
    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(n * stretch_factor)
    self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
    self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
    self._log(["Stretch factor: %.3f", stretch_factor])
    self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    # keep only speech intervals starting within [min_start_length, max_start_length]
    admissible_intervals = [x for x in self.audio_speech if ((x[0] >= min_start_length) and (x[0] <= max_start_length))]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    candidates = []
    # run counters implementing the two early-stopping heuristics below
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf
    for interval in admissible_intervals:
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log(" Breaking: too many runs without improvement")
            break
        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log(" Breaking: too many runs with minimum required length")
            break
        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log(["Evaluating interval starting at %d == %.3f ", start_index, start_time])
        if start_index > o:
            self._log(" Breaking: start index outside audio window")
            break
        # the match must extend at least stretched_match_minimum_length frames
        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > o:
            self._log(" Breaking: not enough audio left in shifted window")
            break
        # DTW window: at most twice the query length
        end_index = min(start_index + 2 * n, o)
        end_time = self._i2t(end_index)
        self._log([" Start %d == %.3f", start_index, start_time])
        self._log([" Req end %d == %.3f", req_end_index, req_end_time])
        self._log([" Eff end %d == %.3f", end_index, end_time])
        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        l, m = audio_mfcc_sub.shape
        self._log("Computing DTW...")
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(m)
        aligner.synt_wave_length = self._i2t(n)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        # last row: total cost of aligning the whole query against each
        # candidate match length in the audio window
        last_row = acm[-1, :]
        self._log("Computing DTW... done")
        # find the minimum, but its index must be >= stretched_match_minimum_length
        candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        # distortion: cost normalized by match length
        candidate_distortion = candidate_value / candidate_length_index
        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0
        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        # append to the list of candidates
        self._log([" Interval start: %d == %.6f", start_index, start_time])
        self._log([" Interval end: %d == %.6f", end_index, end_time])
        self._log([" Candidate start: %d == %.6f", start_index, start_time])
        self._log([" Candidate end: %d == %.6f", candidate_end_index, candidate_end_time])
        self._log([" Candidate length: %d == %.6f", candidate_length_index, candidate_length_time])
        self._log([" Candidate value: %.6f", candidate_value])
        self._log([" Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })
    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time
def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple text fragments, via Python call.

    Return a tuple (anchors, total_time, num_chars).

    :param text_file: the text file whose fragments should be synthesized
    :param string output_file_path: path of the WAVE file to be written
    :param quit_after: if not ``None``, stop synthesizing as soon as the
                       produced audio exceeds this duration
    :param bool backwards: if ``True``, synthesize the fragments in reverse
                           order (anchors then do not make sense)
    :rtype: (bool, (list, TimeValue, int))
    """
    #
    # generating wave data for each fragment,
    # and concatenating them together
    #
    self.log(u"Calling TTS engine via Python...")
    try:
        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = "pcm16"
        output_file.audio_channels = 1
        output_file.audio_sample_rate = self.SAMPLE_RATE
        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num = 0
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        for fragment in fragments:
            # language to voice code
            voice_code = self._language_to_voice_code(fragment.language)
            # synthesize and get the duration of the output file
            self.log([u"Synthesizing fragment %d", num])
            duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code)
            # store for later output
            anchors.append(
                [current_time, fragment.identifier, fragment.text])
            # increase the character counter
            num_chars += fragment.characters
            # append new data
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # if backwards, we append the data reversed
                output_file.add_samples(data, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # increment fragment counter
            num += 1
            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log([
                    u"Quitting after reached duration %.3f", current_time
                ])
                break
        # if backwards, we need to reverse the audio samples again
        if backwards:
            output_file.reverse()
        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        # FIX: pass critical=False and raise_type=None (like the sibling
        # _synthesize_multiple_* methods do); the previous call passed
        # type(exc) as raise_type, which made log_exc re-raise and left the
        # (False, None) return below unreachable, breaking the documented
        # (bool, ...) return contract.
        self.log_exc(
            u"Unexpected exception while calling TTS engine via Python",
            exc, False, None)
        return (False, None)
    # return output
    # NOTE anchors do not make sense if backwards
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via Python... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_generic(self, helper_function, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple fragments, generic function.

    The ``helper_function`` is a function that takes parameters
    ``(text, voice_code, output_file_path)``
    and returns a tuple
    ``(result, (audio_length, audio_sample_rate, audio_format, audio_samples))``.

    :param helper_function: the per-fragment synthesis callable (see above)
    :param text_file: the text file whose fragments should be synthesized
    :param string output_file_path: path of the WAVE file to be written
    :param quit_after: if not ``None``, stop synthesizing as soon as the
                       produced audio exceeds this duration
    :param bool backwards: if ``True``, synthesize the fragments in reverse
                           order (anchor times then do not make sense)
    :rtype: tuple (result, (anchors, current_time, num_chars))
    """
    self.log(u"Calling TTS engine using multiple generic function...")
    # get sample rate and codec
    self.log(u"Determining codec and sample rate...")
    if (self.OUTPUT_AUDIO_FORMAT is None) or (len(self.OUTPUT_AUDIO_FORMAT) != 3):
        # format unknown: probe the engine with a throwaway synthesis
        self.log(u"Determining codec and sample rate with dummy text...")
        succeeded, data = helper_function(
            text=u"Dummy text to get sample_rate",
            voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE),
            output_file_path=None)
        if not succeeded:
            self.log_crit(
                u"An unexpected error occurred in helper_function")
            return (False, None)
        du_nu, sample_rate, codec, da_nu = data
        self.log(
            u"Determining codec and sample rate with dummy text... done")
    else:
        # format declared by the subclass: (codec, channels, sample_rate)
        self.log(u"Reading codec and sample rate from OUTPUT_AUDIO_FORMAT")
        codec, channels_nu, sample_rate = self.OUTPUT_AUDIO_FORMAT
    self.log(u"Determining codec and sample rate... done")
    self.log([u" codec: %s", codec])
    self.log([u" sample rate: %d", sample_rate])
    # open output file
    output_file = AudioFile(rconf=self.rconf, logger=self.logger)
    output_file.audio_format = codec
    output_file.audio_channels = 1
    output_file.audio_sample_rate = sample_rate
    # create output
    anchors = []
    current_time = TimeValue("0.000")
    num_chars = 0
    fragments = text_file.fragments
    if backwards:
        fragments = fragments[::-1]
    # use the cached or the uncached per-fragment loop body
    loop_function = self._loop_use_cache if self.use_cache else self._loop_no_cache
    for num, fragment in enumerate(fragments):
        succeeded, data = loop_function(helper_function=helper_function, num=num, fragment=fragment)
        if not succeeded:
            self.log_crit(u"An unexpected error occurred in loop_function")
            return (False, None)
        duration, sr_nu, enc_nu, samples = data
        # store for later output
        anchors.append([current_time, fragment.identifier, fragment.text])
        # increase the character counter
        num_chars += fragment.characters
        # concatenate new samples
        self.log([u"Fragment %d starts at: %.3f", num, current_time])
        if duration > 0:
            self.log([u"Fragment %d duration: %.3f", num, duration])
            current_time += duration
            output_file.add_samples(samples, reverse=backwards)
        else:
            self.log([u"Fragment %d has zero duration", num])
        # check if we must stop synthesizing because we have enough audio
        if (quit_after is not None) and (current_time > quit_after):
            self.log(
                [u"Quitting after reached duration %.3f", current_time])
            break
    # minimize memory
    self.log(u"Minimizing memory...")
    output_file.minimize_memory()
    self.log(u"Minimizing memory... done")
    # if backwards, we need to reverse the audio samples again
    if backwards:
        self.log(u"Reversing audio samples...")
        output_file.reverse()
        self.log(u"Reversing audio samples... done")
    # write output file
    self.log([u"Writing audio file '%s'", output_file_path])
    output_file.write(file_path=output_file_path)
    # return output
    if backwards:
        self.log_warn(
            u"Please note that anchor time values do not make sense since backwards=True"
        )
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine using multiple generic function... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_subprocess(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple fragments via ``subprocess``.

    :param text_file: the text file whose fragments should be synthesized
    :param string output_file_path: path of the WAVE file to be written
    :param quit_after: if not ``None``, stop synthesizing as soon as the
                       produced audio exceeds this duration
    :param bool backwards: if ``True``, synthesize the fragments in reverse
                           order (anchor times then do not make sense)
    :rtype: tuple (result, (anchors, current_time, num_chars))
    """
    def synthesize_and_clean(text, voice_code):
        """
        Synthesize a single fragment via subprocess,
        and immediately remove the temporary file.

        :rtype: tuple (duration, sample_rate, encoding, samples)
        """
        self.log(u"Synthesizing text...")
        handler, tmp_destination = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        # NOTE(review): the boolean result flag is ignored here; on failure
        # data is presumably None and the unpacking at the call site raises,
        # which is caught by the outer try/except — confirm intended.
        result, data = self._synthesize_single_subprocess(
            text=(text + u" "),
            voice_code=voice_code,
            output_file_path=tmp_destination
        )
        self.log([u"Removing temporary file '%s'", tmp_destination])
        gf.delete_file(handler, tmp_destination)
        self.log(u"Synthesizing text... done")
        return data

    self.log(u"Calling TTS engine via subprocess...")
    try:
        # get sample rate and encoding
        du_nu, sample_rate, encoding, da_nu = synthesize_and_clean(
            text=u"Dummy text to get sample_rate",
            voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE)
        )
        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = encoding
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate
        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num = 0
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        for fragment in fragments:
            # language to voice code
            voice_code = self._language_to_voice_code(fragment.language)
            # synthesize and get the duration of the output file
            self.log([u"Synthesizing fragment %d", num])
            duration, sr_nu, enc_nu, samples = synthesize_and_clean(
                text=fragment.filtered_text,
                voice_code=voice_code
            )
            # store for later output
            anchors.append([current_time, fragment.identifier, fragment.text])
            # increase the character counter
            num_chars += fragment.characters
            # concatenate new samples
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                output_file.add_samples(samples, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # increment fragment counter
            num += 1
            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log([u"Quitting after reached duration %.3f", current_time])
                break
        # minimize memory
        self.log(u"Minimizing memory...")
        output_file.minimize_memory()
        self.log(u"Minimizing memory... done")
        # if backwards, we need to reverse the audio samples again
        if backwards:
            self.log(u"Reversing audio samples...")
            output_file.reverse()
            self.log(u"Reversing audio samples... done")
        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        self.log_exc(u"An unexpected error occurred while calling TTS engine via subprocess", exc, False, None)
        return (False, None)
    # return output
    if backwards:
        self.log_warn(u"Please note that anchor time values do not make sense since backwards=True")
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via subprocess... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple text fragments, via Python call.

    Return a tuple (anchors, total_time, num_chars).

    :param text_file: the text file whose fragments should be synthesized
    :param string output_file_path: path of the WAVE file to be written
    :param quit_after: if not ``None``, stop synthesizing as soon as the
                       produced audio exceeds this duration
    :param bool backwards: if ``True``, synthesize the fragments in reverse
                           order (anchors then do not make sense)
    :rtype: (bool, (list, TimeValue, int))
    """
    #
    # TODO in the Speect Python API I was not able to find a way
    # to generate the wave incrementally
    # so I essentially copy the subprocess call mechanism:
    # generating wave data for each fragment,
    # and concatenating them together
    #
    self.log(u"Calling TTS engine via Python...")
    try:
        # get sample rate and encoding
        # (probe the engine once with a throwaway synthesis)
        du_nu, sample_rate, encoding, da_nu = self._synthesize_single_helper(
            text=u"Dummy text to get sample_rate",
            voice_code=self.DEFAULT_LANGUAGE)
        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = encoding
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate
        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num = 0
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        for fragment in fragments:
            # language to voice code
            #
            # NOTE since voice_code is actually ignored
            # in _synthesize_single_helper(),
            # the value of voice_code is irrelevant
            #
            # however, in general you need to apply
            # the _language_to_voice_code() function that maps
            # the text language to a voice code
            #
            # here we apply the _language_to_voice_code() defined in super()
            # that sets voice_code = fragment.language
            #
            voice_code = self._language_to_voice_code(fragment.language)
            # synthesize and get the duration of the output file
            self.log([u"Synthesizing fragment %d", num])
            duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code)
            # store for later output
            anchors.append(
                [current_time, fragment.identifier, fragment.text])
            # increase the character counter
            num_chars += fragment.characters
            # append new data
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # if backwards, we append the data reversed
                output_file.add_samples(data, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # increment fragment counter
            num += 1
            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log([
                    u"Quitting after reached duration %.3f", current_time
                ])
                break
        # if backwards, we need to reverse the audio samples again
        if backwards:
            output_file.reverse()
        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        self.log_exc(
            u"An unexpected error occurred while calling TTS engine via Python",
            exc, False, None)
        return (False, None)
    # return output
    # NOTE anchors do not make sense if backwards
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via Python... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple text fragments, via Python call.

    Return a tuple (anchors, total_time, num_chars).

    :rtype: (bool, (list, TimeValue, int))
    """
    # The Speect Python API does not seem to offer incremental wave
    # generation, so we mirror the subprocess mechanism instead:
    # each fragment is synthesized on its own and the sample blocks
    # are concatenated into a single output wave.
    self.log(u"Calling TTS engine via Python...")
    try:
        # probe the engine once to discover sample rate and encoding
        du_nu, sample_rate, encoding, da_nu = self._synthesize_single_helper(
            text=u"Dummy text to get sample_rate",
            voice_code=self.DEFAULT_LANGUAGE
        )
        # prepare the container accumulating the synthesized samples
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = encoding
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate
        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0
        # when synthesizing backwards, walk the fragments in reverse order
        fragments = text_file.fragments[::-1] if backwards else text_file.fragments
        for num, fragment in enumerate(fragments):
            # NOTE voice_code is actually ignored by
            # _synthesize_single_helper(); the mapping below is kept for
            # generality — the super() implementation of
            # _language_to_voice_code() simply returns fragment.language
            voice_code = self._language_to_voice_code(fragment.language)
            self.log([u"Synthesizing fragment %d", num])
            duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code
            )
            # remember where this fragment starts in the output wave
            anchors.append([current_time, fragment.identifier, fragment.text])
            num_chars += fragment.characters
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # when going backwards, store the samples reversed
                output_file.add_samples(data, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # stop early once we have produced enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log([u"Quitting after reached duration %.3f", current_time])
                break
        # undo the per-fragment reversal so the wave plays forward
        if backwards:
            output_file.reverse()
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        self.log_exc(u"An unexpected error occurred while calling TTS engine via Python", exc, False, None)
        return (False, None)
    # NOTE anchors do not make sense if backwards
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via Python... done")
    return (True, (anchors, current_time, num_chars))
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect start.

    Synthesize a query wave from the text, extract its MFCCs, and slide a
    DTW match over the speech intervals of the real audio to locate where
    the text starts; return the detected start time (0.0 if no candidate).

    # NOTE(review): min_start_length / max_start_length appear to be
    # expressed in seconds and metric an SDMetric value — confirm at callers.
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric: %s", metric])
    self._log(["Backwards: %s", str(backwards)])
    # characters per second of the real audio (used to stretch the query)
    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate: %.3f", audio_rate])
    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(
        suffix=".wav", dir=gf.custom_tmp_dir()
    )
    synt = Synthesizer(logger=self.logger)
    # synthesize more than max_start_length to have enough query audio
    synt_duration = max_start_length * self.QUERY_FACTOR
    self._log(["Synthesizing %.3f seconds", synt_duration])
    result = synt.synthesize(
        self.text_file,
        tmp_file_path,
        quit_after=synt_duration,
        backwards=backwards
    )
    self._log("Synthesizing query... done")
    query_file = AudioFile(tmp_file_path)
    if backwards:
        self._log("Reversing query")
        query_file.reverse()
    self._log("Extracting MFCCs for query...")
    query_file.extract_mfcc(frame_rate=self.frame_rate)
    query_file.clear_data()
    self._log("Extracting MFCCs for query... done")
    self._log("Cleaning up...")
    # remove the temporary query wave file
    self._cleanup(tmp_handler, tmp_file_path)
    self._log("Cleaning up... done")
    # result[2] is the number of synthesized characters (see Synthesizer)
    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len
    # stretch factor >= 1: how much slower the real audio is vs the query
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate: %.3f", audio_rate])
    self._log(["Query rate: %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])
    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    # only search within the first max_start_length * AUDIO_FACTOR seconds
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])
    # l = number of MFCC coefficients, o = audio frames, n = query frames
    l, o = audio_mfcc.shape
    l, n = query_mfcc.shape
    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(n * stretch_factor)
    self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
    self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
    self._log(["Stretch factor: %.3f", stretch_factor])
    self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    # keep only speech intervals starting within [min_start_length, max_start_length]
    admissible_intervals = [x for x in self.audio_speech if ((x[0] >= min_start_length) and (x[0] <= max_start_length))]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    candidates = []
    # run counters implementing the two early-stopping heuristics below
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf
    for interval in admissible_intervals:
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log(" Breaking: too many runs without improvement")
            break
        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log(" Breaking: too many runs with minimum required length")
            break
        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log(["Evaluating interval starting at %d == %.3f ", start_index, start_time])
        if start_index > o:
            self._log(" Breaking: start index outside audio window")
            break
        # the match must extend at least stretched_match_minimum_length frames
        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > o:
            self._log(" Breaking: not enough audio left in shifted window")
            break
        # DTW window: at most twice the query length
        end_index = min(start_index + 2 * n, o)
        end_time = self._i2t(end_index)
        self._log([" Start %d == %.3f", start_index, start_time])
        self._log([" Req end %d == %.3f", req_end_index, req_end_time])
        self._log([" Eff end %d == %.3f", end_index, end_time])
        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        l, m = audio_mfcc_sub.shape
        self._log("Computing DTW...")
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(m)
        aligner.synt_wave_length = self._i2t(n)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        # last row: total cost of aligning the whole query against each
        # candidate match length in the audio window
        last_row = acm[-1, :]
        self._log("Computing DTW... done")
        # find the minimum, but its index must be >= stretched_match_minimum_length
        candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        # distortion: cost normalized by match length
        candidate_distortion = candidate_value / candidate_length_index
        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0
        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        # append to the list of candidates
        self._log([" Interval start: %d == %.6f", start_index, start_time])
        self._log([" Interval end: %d == %.6f", end_index, end_time])
        self._log([" Candidate start: %d == %.6f", start_index, start_time])
        self._log([" Candidate end: %d == %.6f", candidate_end_index, candidate_end_time])
        self._log([" Candidate length: %d == %.6f", candidate_length_index, candidate_length_time])
        self._log([" Candidate value: %.6f", candidate_value])
        self._log([" Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })
    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time