Example #1
    def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize multiple text fragments, via Python call.

        Return a tuple ``(result, (anchors, total_time, num_chars))``.

        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # generating wave data for each fragment,
        # and concatenating them together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = "pcm16"
            output_file.audio_channels = 1
            output_file.audio_sample_rate = self.SAMPLE_RATE

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num = 0
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for fragment in fragments:
                # language to voice code
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code
                )
                # store for later output
                anchors.append([current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # append new data
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    # if backwards, we append the data reversed
                    output_file.add_samples(data, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # increment fragment counter
                num += 1
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([u"Quitting after reached duration %.3f", current_time])
                    break

            # if backwards, we need to reverse the audio samples again
            if backwards:
                output_file.reverse()

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"Unexpected exception while calling TTS engine via Python", exc, None, type(exc))
            return (False, None)

        # return output
        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
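
The backwards logic above is easy to misread: each fragment's samples are appended reversed, and the whole buffer is reversed once more before writing. A minimal numpy sketch (not part of the original code; the arrays are illustrative) showing that the two reversals cancel out, so the synthesized tail comes out in forward order:

import numpy

# hypothetical per-fragment sample arrays
fragments = [numpy.array([1, 2]), numpy.array([3, 4]), numpy.array([5, 6])]

# backwards mode: iterate fragments last-to-first, append each reversed
buf = numpy.concatenate([f[::-1] for f in reversed(fragments)])
# then reverse the whole buffer once at the end
forward = buf[::-1]

# the two reversals cancel: plain forward concatenation
assert numpy.array_equal(forward, numpy.concatenate(fragments))
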
Example #2
    def _detect_start(self,
                      min_start_length,
                      max_start_length,
                      metric,
                      backwards=False):
        """ Detect start """

        self._log(["Min start length: %.3f", min_start_length])
        self._log(["Max start length: %.3f", max_start_length])
        self._log(["Metric:           %s", metric])
        self._log(["Backwards:        %s", str(backwards)])

        audio_rate = self.text_file.characters / self.audio_file.audio_length
        self._log(["Audio rate:     %.3f", audio_rate])

        self._log("Synthesizing query...")
        tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav",
                                                      dir=gf.custom_tmp_dir())
        synt = Synthesizer(logger=self.logger)
        synt_duration = max_start_length * self.QUERY_FACTOR
        self._log(["Synthesizing %.3f seconds", synt_duration])
        result = synt.synthesize(self.text_file,
                                 tmp_file_path,
                                 quit_after=synt_duration,
                                 backwards=backwards)
        self._log("Synthesizing query... done")

        query_file = AudioFile(tmp_file_path)
        if backwards:
            self._log("Reversing query")
            query_file.reverse()
        self._log("Extracting MFCCs for query...")
        query_file.extract_mfcc(frame_rate=self.frame_rate)
        query_file.clear_data()
        self._log("Extracting MFCCs for query... done")

        self._log("Cleaning up...")
        self._cleanup(tmp_handler, tmp_file_path)
        self._log("Cleaning up... done")

        query_characters = result[2]
        query_len = query_file.audio_length
        query_mfcc = query_file.audio_mfcc
        query_rate = query_characters / query_len

        stretch_factor = max(1, query_rate / audio_rate)
        self._log(["Audio rate:     %.3f", audio_rate])
        self._log(["Query rate:     %.3f", query_rate])
        self._log(["Stretch factor: %.3f", stretch_factor])

        audio_mfcc = self.audio_file.audio_mfcc
        self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
        audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR *
                                   self.frame_rate)
        self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
        audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
        audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
        self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])

        l, o = audio_mfcc.shape
        l, n = query_mfcc.shape

        # minimum length of a matched interval in the real audio
        stretched_match_minimum_length = int(n * stretch_factor)

        self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
        self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
        self._log(["Stretch factor:          %.3f", stretch_factor])
        self._log(
            ["Required minimum length: %d frames", stretched_match_minimum_length])
        self._log("Speech intervals:")
        for interval in self.audio_speech:
            self._log([
                "  %d %d == %.3f %.3f",
                self._t2i(interval[0]),
                self._t2i(interval[1]), interval[0], interval[1]
            ])

        admissible_intervals = [
            x for x in self.audio_speech
            if ((x[0] >= min_start_length) and (x[0] <= max_start_length))
        ]
        self._log("AdmissibleSpeech intervals:")
        for interval in admissible_intervals:
            self._log([
                "  %d %d == %.3f %.3f",
                self._t2i(interval[0]),
                self._t2i(interval[1]), interval[0], interval[1]
            ])

        candidates = []
        runs_with_min_length = 0
        runs_no_improvement = 0
        runs_min_distortion = numpy.inf
        runs_min_value = numpy.inf

        for interval in admissible_intervals:
            if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
                self._log("  Breaking: too many runs without improvement")
                break

            if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
                self._log(
                    "  Breaking: too many runs with minimum required length")
                break

            start_time = interval[0]
            start_index = self._t2i(start_time)
            self._log([
                "Evaluating interval starting at %d == %.3f ", start_index,
                start_time
            ])
            if start_index > o:
                self._log("  Breaking: start index outside audio window")
                break

            req_end_index = start_index + stretched_match_minimum_length
            req_end_time = self._i2t(req_end_index)
            if req_end_index > o:
                self._log(
                    "  Breaking: not enough audio left in shifted window")
                break
            end_index = min(start_index + 2 * n, o)
            end_time = self._i2t(end_index)

            self._log(["  Start   %d == %.3f", start_index, start_time])
            self._log(["  Req end %d == %.3f", req_end_index, req_end_time])
            self._log(["  Eff end %d == %.3f", end_index, end_time])

            audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
            l, m = audio_mfcc_sub.shape

            self._log("Computing DTW...")
            aligner = DTWAligner(None,
                                 None,
                                 frame_rate=self.frame_rate,
                                 logger=self.logger)
            aligner.real_wave_full_mfcc = audio_mfcc_sub
            aligner.synt_wave_full_mfcc = query_mfcc
            aligner.real_wave_length = self._i2t(m)
            aligner.synt_wave_length = self._i2t(n)
            acm = aligner.compute_accumulated_cost_matrix()
            # transpose, so we have an n x m accumulated cost matrix
            acm = acm.transpose()
            last_row = acm[-1, :]
            self._log("Computing DTW... done")

            # find the minimum, but its index must be >= stretched_match_minimum_length
            candidate_argmin_index = numpy.argmin(
                last_row[stretched_match_minimum_length:])
            candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
            candidate_length_time = self._i2t(candidate_length_index)
            candidate_value = last_row[candidate_length_index]
            candidate_end_index = start_index + candidate_length_index
            candidate_end_time = self._i2t(candidate_end_index)
            candidate_distortion = candidate_value / candidate_length_index

            # check if the candidate has minimum length
            if candidate_length_index == stretched_match_minimum_length:
                runs_with_min_length += 1
            else:
                runs_with_min_length = 0

            # check if the candidate improved the global minimum value
            if metric == SDMetric.VALUE:
                if candidate_value < runs_min_value:
                    runs_min_value = candidate_value
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1
            if metric == SDMetric.DISTORTION:
                if candidate_distortion < runs_min_distortion:
                    runs_min_distortion = candidate_distortion
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1

            # append to the list of candidates
            self._log([
                "    Interval  start:      %d == %.6f", start_index, start_time
            ])
            self._log(
                ["    Interval  end:        %d == %.6f", end_index, end_time])
            self._log([
                "    Candidate start:      %d == %.6f", start_index, start_time
            ])
            self._log([
                "    Candidate end:        %d == %.6f", candidate_end_index,
                candidate_end_time
            ])
            self._log([
                "    Candidate length:     %d == %.6f", candidate_length_index,
                candidate_length_time
            ])
            self._log(["    Candidate value:      %.6f", candidate_value])
            self._log(["    Candidate distortion: %.6f", candidate_distortion])
            candidates.append({
                "start_index": start_index,
                "length": candidate_length_index,
                "value": candidate_value,
                "distortion": candidate_distortion
            })

        # select best candidate and return its start time
        # if we have no best candidate, return 0.0
        best_candidate = self._select_best_candidate(candidates, metric)
        if best_candidate is None:
            return 0.0
        sd_time = self._i2t(max(best_candidate["start_index"], 0))
        self._log(["Returning time %.3f", sd_time])
        return sd_time
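
The _i2t() and _t2i() helpers used throughout this example are not shown. Their usage implies a plain conversion between MFCC frame indices and seconds via self.frame_rate (frames per second); a hypothetical sketch under that assumption:

def i2t(index, frame_rate):
    # frame index -> time in seconds (stand-in for self._i2t())
    return index / float(frame_rate)

def t2i(time, frame_rate):
    # time in seconds -> frame index (stand-in for self._t2i())
    return int(time * frame_rate)

# e.g., at 25 MFCC frames per second, frame 50 corresponds to 2.0 s
assert i2t(50, 25) == 2.0
assert t2i(2.0, 25) == 50
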
Example #3
    def _synthesize_multiple_python(self,
                                    text_file,
                                    output_file_path,
                                    quit_after=None,
                                    backwards=False):
        """
        Synthesize multiple text fragments, via Python call.

        Return a tuple ``(result, (anchors, total_time, num_chars))``.

        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # generating wave data for each fragment,
        # and concatenating them together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = "pcm16"
            output_file.audio_channels = 1
            output_file.audio_sample_rate = self.SAMPLE_RATE

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num = 0
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for fragment in fragments:
                # language to voice code
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code)
                # store for later output
                anchors.append(
                    [current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # append new data
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    # if backwards, we append the data reversed
                    output_file.add_samples(data, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # increment fragment counter
                num += 1
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([
                        u"Quitting after reaching duration %.3f", current_time
                    ])
                    break

            # if backwards, we need to reverse the audio samples again
            if backwards:
                output_file.reverse()

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(
                u"Unexpected exception while calling TTS engine via Python",
                exc, False, None)
            return (False, None)

        # return output
        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
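
A note on current_time: it starts from TimeValue("0.000") and accumulates one duration per fragment. Assuming TimeValue is backed by exact decimal arithmetic (its string constructor suggests so), the anchor times do not drift; a sketch with decimal.Decimal standing in for TimeValue:

from decimal import Decimal

current_time = Decimal("0.000")
for _ in range(10):
    current_time += Decimal("0.100")      # mirrors current_time += duration

assert current_time == Decimal("1.000")   # exact
assert sum([0.1] * 10) != 1.0             # float accumulation drifts
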
Example #4
    def _synthesize_multiple_generic(self,
                                     helper_function,
                                     text_file,
                                     output_file_path,
                                     quit_after=None,
                                     backwards=False):
        """
        Synthesize multiple fragments, generic function.

        The ``helper_function`` is a function that takes parameters
        ``(text, voice_code, output_file_path)``
        and returns a tuple
        ``(result, (audio_length, audio_sample_rate, audio_format, audio_samples))``.

        :rtype: tuple (result, (anchors, current_time, num_chars))
        """
        self.log(u"Calling TTS engine using multiple generic function...")

        # get sample rate and codec
        self.log(u"Determining codec and sample rate...")
        if (self.OUTPUT_AUDIO_FORMAT is None) or (len(self.OUTPUT_AUDIO_FORMAT)
                                                  != 3):
            self.log(u"Determining codec and sample rate with dummy text...")
            succeeded, data = helper_function(
                text=u"Dummy text to get sample_rate",
                voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE),
                output_file_path=None)
            if not succeeded:
                self.log_crit(
                    u"An unexpected error occurred in helper_function")
                return (False, None)
            du_nu, sample_rate, codec, da_nu = data
            self.log(
                u"Determining codec and sample rate with dummy text... done")
        else:
            self.log(u"Reading codec and sample rate from OUTPUT_AUDIO_FORMAT")
            codec, channels_nu, sample_rate = self.OUTPUT_AUDIO_FORMAT
        self.log(u"Determining codec and sample rate... done")
        self.log([u"  codec:       %s", codec])
        self.log([u"  sample rate: %d", sample_rate])

        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = codec
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate

        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        loop_function = self._loop_use_cache if self.use_cache else self._loop_no_cache
        for num, fragment in enumerate(fragments):
            succeeded, data = loop_function(helper_function=helper_function,
                                            num=num,
                                            fragment=fragment)
            if not succeeded:
                self.log_crit(u"An unexpected error occurred in loop_function")
                return (False, None)
            duration, sr_nu, enc_nu, samples = data
            # store for later output
            anchors.append([current_time, fragment.identifier, fragment.text])
            # increase the character counter
            num_chars += fragment.characters
            # concatenate new samples
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                output_file.add_samples(samples, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log(
                    [u"Quitting after reaching duration %.3f", current_time])
                break

        # minimize memory
        self.log(u"Minimizing memory...")
        output_file.minimize_memory()
        self.log(u"Minimizing memory... done")

        # if backwards, we need to reverse the audio samples again
        if backwards:
            self.log(u"Reversing audio samples...")
            output_file.reverse()
            self.log(u"Reversing audio samples... done")

        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)

        # return output
        if backwards:
            self.log_warn(
                u"Please note that anchor time values do not make sense since backwards=True"
            )
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine using multiple generic function... done")
        return (True, (anchors, current_time, num_chars))
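
The docstring above fixes the helper_function contract: it takes (text, voice_code, output_file_path) and returns (result, (audio_length, audio_sample_rate, audio_format, audio_samples)). A minimal conforming stub with made-up values (illustrative only, not one of the library's real helpers):

import numpy

def dummy_helper_function(text, voice_code, output_file_path=None):
    # a real helper would synthesize `text` with the given voice and,
    # when output_file_path is not None, also write the audio there
    sample_rate = 16000
    audio_length = 0.5     # the real helpers return a TimeValue here
    samples = numpy.zeros(int(sample_rate * audio_length))
    return (True, (audio_length, sample_rate, "pcm16", samples))
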
Example #5
    def _synthesize_multiple_subprocess(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize multiple fragments via ``subprocess``.

        :rtype: tuple (result, (anchors, current_time, num_chars))
        """
        def synthesize_and_clean(text, voice_code):
            """
            Synthesize a single fragment via subprocess,
            and immediately remove the temporary file.

            :rtype: tuple (duration, sample_rate, encoding, samples)
            """
            self.log(u"Synthesizing text...")
            handler, tmp_destination = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
            result, data = self._synthesize_single_subprocess(
                text=(text + u" "),
                voice_code=voice_code,
                output_file_path=tmp_destination
            )
            self.log([u"Removing temporary file '%s'", tmp_destination])
            gf.delete_file(handler, tmp_destination)
            self.log(u"Synthesizing text... done")
            return data

        self.log(u"Calling TTS engine via subprocess...")

        try:
            # get sample rate and encoding
            du_nu, sample_rate, encoding, da_nu = synthesize_and_clean(
                text=u"Dummy text to get sample_rate",
                voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE)
            )

            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = encoding
            output_file.audio_channels = 1
            output_file.audio_sample_rate = sample_rate

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num = 0
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for fragment in fragments:
                # language to voice code
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, samples = synthesize_and_clean(
                    text=fragment.filtered_text,
                    voice_code=voice_code
                )
                # store for later output
                anchors.append([current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # concatenate new samples
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    output_file.add_samples(samples, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # increment fragment counter
                num += 1
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([u"Quitting after reached duration %.3f", current_time])
                    break

            # minimize memory
            self.log(u"Minimizing memory...")
            output_file.minimize_memory()
            self.log(u"Minimizing memory... done")

            # if backwards, we need to reverse the audio samples again
            if backwards:
                self.log(u"Reversing audio samples...")
                output_file.reverse()
                self.log(u"Reversing audio samples... done")

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while calling TTS engine via subprocess", exc, False, None)
            return (False, None)

        # return output
        if backwards:
            self.log_warn(u"Please note that anchor time values do not make sense since backwards=True")
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via subprocess... done")
        return (True, (anchors, current_time, num_chars))
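
The gf.tmp_file() and gf.delete_file() helpers used by synthesize_and_clean() are not shown. A plausible standard-library sketch of the create-use-delete pattern they implement (names and signatures are assumptions):

import os
import tempfile

def tmp_file(suffix=u".wav", root=None):
    # create a temporary file, returning its OS-level handle and path
    return tempfile.mkstemp(suffix=suffix, dir=root)

def delete_file(handler, path):
    # close the handle and remove the file, ignoring missing files
    try:
        os.close(handler)
    except OSError:
        pass
    try:
        os.remove(path)
    except OSError:
        pass
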
Example #6
    def _synthesize_multiple_python(self,
                                    text_file,
                                    output_file_path,
                                    quit_after=None,
                                    backwards=False):
        """
        Synthesize multiple text fragments, via Python call.

        Return a tuple ``(result, (anchors, total_time, num_chars))``.

        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # TODO in the Speect Python API I was not able to find a way
        #      to generate the wave incrementally
        #      so I essentially copy the subprocess call mechanism:
        #      generating wave data for each fragment,
        #      and concatenating them together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # get sample rate and encoding
            du_nu, sample_rate, encoding, da_nu = self._synthesize_single_helper(
                text=u"Dummy text to get sample_rate",
                voice_code=self.DEFAULT_LANGUAGE)

            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = encoding
            output_file.audio_channels = 1
            output_file.audio_sample_rate = sample_rate

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num = 0
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for fragment in fragments:
                # language to voice code
                #
                # NOTE since voice_code is actually ignored
                # in _synthesize_single_helper(),
                # the value of voice_code is irrelevant
                #
                # however, in general you need to apply
                # the _language_to_voice_code() function that maps
                # the text language to a voice code
                #
                # here we use the _language_to_voice_code() defined in the
                # superclass, which sets voice_code = fragment.language
                #
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code)
                # store for later output
                anchors.append(
                    [current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # append new data
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    # if backwards, we append the data reversed
                    output_file.add_samples(data, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # increment fragment counter
                num += 1
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([
                        u"Quitting after reaching duration %.3f", current_time
                    ])
                    break

            # if backwards, we need to reverse the audio samples again
            if backwards:
                output_file.reverse()

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(
                u"An unexpected error occurred while calling TTS engine via Python",
                exc, False, None)
            return (False, None)

        # return output
        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
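
The NOTE in the loop above says the default _language_to_voice_code() simply sets voice_code = fragment.language. A sketch of that default as the comment describes it (the actual superclass implementation is not shown here):

def _language_to_voice_code(self, language):
    # identity mapping: use the fragment's language itself as the
    # voice code; concrete TTS wrappers override this to map
    # languages to engine-specific voices
    return language
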
Example #7
    def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize multiple text fragments, via Python call.

        Return a tuple ``(result, (anchors, total_time, num_chars))``.

        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # TODO in the Speect Python API I was not able to find a way
        #      to generate the wave incrementally
        #      so I essentially copy the subprocess call mechanism:
        #      generating wave data for each fragment,
        #      and concatenating them together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # get sample rate and encoding
            du_nu, sample_rate, encoding, da_nu = self._synthesize_single_helper(
                text=u"Dummy text to get sample_rate",
                voice_code=self.DEFAULT_LANGUAGE
            )

            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = encoding
            output_file.audio_channels = 1
            output_file.audio_sample_rate = sample_rate

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num = 0
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for fragment in fragments:
                # language to voice code
                #
                # NOTE since voice_code is actually ignored
                # in _synthesize_single_helper(),
                # the value of voice_code is irrelevant
                #
                # however, in general you need to apply
                # the _language_to_voice_code() function that maps
                # the text language to a voice code
                #
                # here we use the _language_to_voice_code() defined in the
                # superclass, which sets voice_code = fragment.language
                #
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code
                )
                # store for later output
                anchors.append([current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # append new data
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    # if backwards, we append the data reversed
                    output_file.add_samples(data, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # increment fragment counter
                num += 1
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([u"Quitting after reached duration %.3f", current_time])
                    break

            # if backwards, we need to reverse the audio samples again
            if backwards:
                output_file.reverse()

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while calling TTS engine via Python", exc, False, None)
            return (False, None)

        # return output
        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
Example #8
    def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
        """ Detect start """

        self._log(["Min start length: %.3f", min_start_length])
        self._log(["Max start length: %.3f", max_start_length])
        self._log(["Metric:           %s", metric])
        self._log(["Backwards:        %s", str(backwards)])

        audio_rate = self.text_file.characters / self.audio_file.audio_length
        self._log(["Audio rate:     %.3f", audio_rate])

        self._log("Synthesizing query...")
        tmp_handler, tmp_file_path = tempfile.mkstemp(
            suffix=".wav",
            dir=gf.custom_tmp_dir()
        )
        synt = Synthesizer(logger=self.logger)
        synt_duration = max_start_length * self.QUERY_FACTOR
        self._log(["Synthesizing %.3f seconds", synt_duration])
        result = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=backwards
        )
        self._log("Synthesizing query... done")

        query_file = AudioFile(tmp_file_path)
        if backwards:
            self._log("Reversing query")
            query_file.reverse()
        self._log("Extracting MFCCs for query...")
        query_file.extract_mfcc(frame_rate=self.frame_rate)
        query_file.clear_data()
        self._log("Extracting MFCCs for query... done")

        self._log("Cleaning up...")
        self._cleanup(tmp_handler, tmp_file_path)
        self._log("Cleaning up... done")

        query_characters = result[2]
        query_len = query_file.audio_length
        query_mfcc = query_file.audio_mfcc
        query_rate = query_characters / query_len

        stretch_factor = max(1, query_rate / audio_rate)
        self._log(["Audio rate:     %.3f", audio_rate])
        self._log(["Query rate:     %.3f", query_rate])
        self._log(["Stretch factor: %.3f", stretch_factor])

        audio_mfcc = self.audio_file.audio_mfcc
        self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
        audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
        self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
        audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
        audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
        self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])

        l, o = audio_mfcc.shape
        l, n = query_mfcc.shape

        # minimum length of a matched interval in the real audio
        stretched_match_minimum_length = int(n * stretch_factor)

        self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
        self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
        self._log(["Stretch factor:          %.3f", stretch_factor])
        self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
        self._log("Speech intervals:")
        for interval in self.audio_speech:
            self._log(["  %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])

        admissible_intervals = [x for x in self.audio_speech if ((x[0] >= min_start_length) and (x[0] <= max_start_length))]
        self._log("AdmissibleSpeech intervals:")
        for interval in admissible_intervals:
            self._log(["  %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])

        candidates = []
        runs_with_min_length = 0
        runs_no_improvement = 0
        runs_min_distortion = numpy.inf
        runs_min_value = numpy.inf

        for interval in admissible_intervals:
            if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
                self._log("  Breaking: too many runs without improvement")
                break

            if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
                self._log("  Breaking: too many runs with minimum required length")
                break

            start_time = interval[0]
            start_index = self._t2i(start_time)
            self._log(["Evaluating interval starting at %d == %.3f ", start_index, start_time])
            if start_index > o:
                self._log("  Breaking: start index outside audio window")
                break

            req_end_index = start_index + stretched_match_minimum_length
            req_end_time = self._i2t(req_end_index)
            if req_end_index > o:
                self._log("  Breaking: not enough audio left in shifted window")
                break
            end_index = min(start_index + 2 * n, o)
            end_time = self._i2t(end_index)

            self._log(["  Start   %d == %.3f", start_index, start_time])
            self._log(["  Req end %d == %.3f", req_end_index, req_end_time])
            self._log(["  Eff end %d == %.3f", end_index, end_time])

            audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
            l, m = audio_mfcc_sub.shape

            self._log("Computing DTW...")
            aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
            aligner.real_wave_full_mfcc = audio_mfcc_sub
            aligner.synt_wave_full_mfcc = query_mfcc
            aligner.real_wave_length = self._i2t(m)
            aligner.synt_wave_length = self._i2t(n)
            acm = aligner.compute_accumulated_cost_matrix()
            # transpose, so we have an n x m accumulated cost matrix
            acm = acm.transpose()
            last_row = acm[-1, :]
            self._log("Computing DTW... done")

            # find the minimum, but its index must be >= stretched_match_minimum_length
            candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
            candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
            candidate_length_time = self._i2t(candidate_length_index)
            candidate_value = last_row[candidate_length_index]
            candidate_end_index = start_index + candidate_length_index
            candidate_end_time = self._i2t(candidate_end_index)
            candidate_distortion = candidate_value / candidate_length_index

            # check if the candidate has minimum length
            if candidate_length_index == stretched_match_minimum_length:
                runs_with_min_length += 1
            else:
                runs_with_min_length = 0

            # check if the candidate improved the global minimum value
            if metric == SDMetric.VALUE:
                if candidate_value < runs_min_value:
                    runs_min_value = candidate_value
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1
            if metric == SDMetric.DISTORTION:
                if candidate_distortion < runs_min_distortion:
                    runs_min_distortion = candidate_distortion
                    runs_no_improvement = 0
                else:
                    runs_no_improvement += 1

            # append to the list of candidates
            self._log(["    Interval  start:      %d == %.6f", start_index, start_time])
            self._log(["    Interval  end:        %d == %.6f", end_index, end_time])
            self._log(["    Candidate start:      %d == %.6f", start_index, start_time])
            self._log(["    Candidate end:        %d == %.6f", candidate_end_index, candidate_end_time])
            self._log(["    Candidate length:     %d == %.6f", candidate_length_index, candidate_length_time])
            self._log(["    Candidate value:      %.6f", candidate_value])
            self._log(["    Candidate distortion: %.6f", candidate_distortion])
            candidates.append({
                "start_index": start_index,
                "length": candidate_length_index,
                "value": candidate_value,
                "distortion": candidate_distortion
            })

        # select best candidate and return its start time
        # if we have no best candidate, return 0.0
        best_candidate = self._select_best_candidate(candidates, metric)
        if best_candidate is None:
            return 0.0
        sd_time = self._i2t(max(best_candidate["start_index"], 0))
        self._log(["Returning time %.3f", sd_time])
        return sd_time
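
_select_best_candidate() is referenced but not shown. Given how the candidates dicts are built above ("value" and "distortion" keys) and how metric gates the improvement checks, a plausible sketch:

def _select_best_candidate(self, candidates, metric):
    # hypothetical: pick the candidate minimizing the chosen metric,
    # or None if there are no candidates
    if len(candidates) == 0:
        return None
    key = "value" if metric == SDMetric.VALUE else "distortion"
    return min(candidates, key=lambda c: c[key])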