Example No. 1
    def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize multiple text fragments, via Python call.

        Return a tuple (anchors, total_time, num_chars).

        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # generating wave data for each fragment,
        # and concatenating them together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = "pcm16"
            output_file.audio_channels = 1
            output_file.audio_sample_rate = self.SAMPLE_RATE

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num = 0
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for fragment in fragments:
                # language to voice code
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code
                )
                # store for later output
                anchors.append([current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # append new data
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    # if backwards, we append the data reversed
                    output_file.add_samples(data, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # increment fragment counter
                num += 1
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([u"Quitting after reached duration %.3f", current_time])
                    break

            # if backwards, we need to reverse the audio samples again
            if backwards:
                output_file.reverse()

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"Unexpected exception while calling TTS engine via Python", exc, None, type(exc))
            return (False, None)

        # return output
        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
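
The pattern above records each fragment's start time before its audio is appended, and only advances the running time for fragments with nonzero duration. A minimal standalone sketch of that bookkeeping, with a hypothetical synthesize() stub in place of the real TTS call:

def synthesize(text):
    # hypothetical stub: pretend each character takes 0.05 s
    # and contributes one "sample"
    return (0.05 * len(text), list(text))

def build_anchors(fragments, quit_after=None):
    anchors = []
    samples = []
    current_time = 0.0
    for identifier, text in fragments:
        duration, data = synthesize(text)
        # the fragment start time is recorded before its audio is appended
        anchors.append((current_time, identifier, text))
        if duration > 0:
            current_time += duration
            samples.extend(data)
        # stop early once we have enough audio
        if (quit_after is not None) and (current_time > quit_after):
            break
    return (anchors, current_time, samples)

anchors, total_time, _ = build_anchors([("f001", "Hello"), ("f002", "world")])
print(anchors)     # [(0.0, 'f001', 'Hello'), (0.25, 'f002', 'world')]
print(total_time)  # 0.5
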
Example No. 2
    def _cut_head_tail(self, audio_file_path):
        """
        Set the audio file head or tail,
        suitably cutting the audio file on disk,
        and setting the corresponding parameters in the task configuration.

        Return a success bool flag.
        """
        self._log("Setting head and/or tail")
        try:
            configuration = self.task.configuration
            head_length = configuration.is_audio_file_head_length
            process_length = configuration.is_audio_file_process_length
            detect_head_min = configuration.is_audio_file_detect_head_min
            detect_head_max = configuration.is_audio_file_detect_head_max
            detect_tail_min = configuration.is_audio_file_detect_tail_min
            detect_tail_max = configuration.is_audio_file_detect_tail_max

            # explicit head or process?
            explicit = (head_length is not None) or (process_length is not None)

            # at least one detect parameter?
            detect = ((detect_head_min is not None)
                      or (detect_head_max is not None)
                      or (detect_tail_min is not None)
                      or (detect_tail_max is not None))

            if explicit or detect:
                # we need to load the audio data
                audio_file = AudioFile(audio_file_path, logger=self.logger)
                audio_file.load_data()

                if explicit:
                    self._log("Explicit head or process")
                else:
                    self._log(
                        "No explicit head or process => detecting head/tail")

                    head = 0.0
                    if (detect_head_min is not None) or (detect_head_max is not None):
                        self._log("Detecting head...")
                        detect_head_min = gf.safe_float(
                            detect_head_min, gc.SD_MIN_HEAD_LENGTH)
                        detect_head_max = gf.safe_float(
                            detect_head_max, gc.SD_MAX_HEAD_LENGTH)
                        self._log(["detect_head_min is %.3f", detect_head_min])
                        self._log(["detect_head_max is %.3f", detect_head_max])
                        sd = SD(audio_file,
                                self.task.text_file,
                                logger=self.logger)
                        head = sd.detect_head(detect_head_min, detect_head_max)
                        self._log(["Detected head: %.3f", head])

                    tail = 0.0
                    if (detect_tail_min is not None) or (detect_tail_max is not None):
                        self._log("Detecting tail...")
                        detect_tail_max = gf.safe_float(
                            detect_tail_max, gc.SD_MAX_TAIL_LENGTH)
                        detect_tail_min = gf.safe_float(
                            detect_tail_min, gc.SD_MIN_TAIL_LENGTH)
                        self._log(["detect_tail_min is %.3f", detect_tail_min])
                        self._log(["detect_tail_max is %.3f", detect_tail_max])
                        sd = SD(audio_file,
                                self.task.text_file,
                                logger=self.logger)
                        tail = sd.detect_tail(detect_tail_min, detect_tail_max)
                        self._log(["Detected tail: %.3f", tail])

                    # sanity check
                    head_length = max(0, head)
                    process_length = max(0, audio_file.audio_length - tail - head)

                    # we need to set these values
                    # in the config object for later use
                    self.task.configuration.is_audio_file_head_length = head_length
                    self.task.configuration.is_audio_file_process_length = process_length
                    self._log(["Set head_length:    %.3f", head_length])
                    self._log(["Set process_length: %.3f", process_length])

                if head_length is not None:
                    # in case we are reading from config object
                    head_length = float(head_length)
                if process_length is not None:
                    # in case we are reading from config object
                    process_length = float(process_length)
                # note that str() is necessary, as one might be None
                self._log(
                    ["is_audio_file_head_length is %s",
                     str(head_length)])
                self._log([
                    "is_audio_file_process_length is %s",
                    str(process_length)
                ])
                self._log("Trimming audio data...")
                audio_file.trim(head_length, process_length)
                self._log("Trimming audio data... done")
                self._log("Writing audio file...")
                audio_file.write(audio_file_path)
                self._log("Writing audio file... done")
                audio_file.clear_data()
            else:
                # nothing to do
                self._log("No explicit head/process or detect head/tail")

            self._log("Setting head and/or tail: succeeded")
            return True
        except Exception as e:
            self._log("Setting head and/or tail: failed")
            self._log(["Message: %s", str(e)])
            return False
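
In the detection branch, the detected head and tail are converted into the two values the rest of the pipeline uses: the head length and the process (middle) length. A tiny worked example of that arithmetic, with made-up numbers:

# worked example of the head/process arithmetic above (made-up numbers)
audio_length = 60.0   # total audio duration, in seconds
head = 1.2            # detected head (e.g. leading silence or intro)
tail = 2.5            # detected tail (e.g. trailing silence or outro)

head_length = max(0, head)                            # 1.2
process_length = max(0, audio_length - tail - head)   # 56.3

# the subsequent trim keeps [head_length, head_length + process_length),
# i.e. seconds 1.2 through 57.5 of the original audio
print(head_length, process_length)
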
Example No. 3
    def _synthesize_multiple_python(self,
                                    text_file,
                                    output_file_path,
                                    quit_after=None,
                                    backwards=False):
        """
        Synthesize multiple text fragments, via Python call.

        Return a tuple (anchors, total_time, num_chars).

        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # generating wave data for each fragment,
        # and concatenating them together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = "pcm16"
            output_file.audio_channels = 1
            output_file.audio_sample_rate = self.SAMPLE_RATE

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num = 0
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for fragment in fragments:
                # language to voice code
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code)
                # store for later output
                anchors.append(
                    [current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # append new data
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    # if backwards, we append the data reversed
                    output_file.add_samples(data, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # increment fragment counter
                num += 1
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([
                        u"Quitting after reached duration %.3f", current_time
                    ])
                    break

            # if backwards, we need to reverse the audio samples again
            if backwards:
                output_file.reverse()

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(
                u"Unexpected exception while calling TTS engine via Python",
                exc, None, type(exc))
            return (False, None)

        # return output
        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
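
When backwards=True, the fragment list is reversed, each fragment's samples are appended reversed, and the whole buffer is reversed once more before writing, so the synthesized portion comes out in forward order (which is why the anchor times no longer make sense). A small sketch of that double reverse with plain lists, assuming one pseudo-sample per character:

fragments = ["abc", "de"]

# forward mode: plain concatenation
forward = []
for f in fragments:
    forward.extend(list(f))        # ['a', 'b', 'c', 'd', 'e']

# backwards mode: reversed fragment order, each fragment appended reversed...
buf = []
for f in fragments[::-1]:
    buf.extend(list(f)[::-1])      # ['e', 'd', 'c', 'b', 'a']
# ...then one final reverse of the whole buffer before writing
buf = buf[::-1]                    # ['a', 'b', 'c', 'd', 'e']

assert buf == forward
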
Example No. 4
    def _synthesize_multiple_generic(self,
                                     helper_function,
                                     text_file,
                                     output_file_path,
                                     quit_after=None,
                                     backwards=False):
        """
        Synthesize multiple fragments, generic function.

        The ``helper_function`` is a function that takes parameters
        ``(text, voice_code, output_file_path)``
        and returns a tuple
        ``(result, (audio_length, audio_sample_rate, audio_format, audio_samples))``.

        :rtype: tuple (result, (anchors, current_time, num_chars))
        """
        self.log(u"Calling TTS engine using multiple generic function...")

        # get sample rate and codec
        self.log(u"Determining codec and sample rate...")
        if (self.OUTPUT_AUDIO_FORMAT is None) or (len(self.OUTPUT_AUDIO_FORMAT) != 3):
            self.log(u"Determining codec and sample rate with dummy text...")
            succeeded, data = helper_function(
                text=u"Dummy text to get sample_rate",
                voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE),
                output_file_path=None)
            if not succeeded:
                self.log_crit(
                    u"An unexpected error occurred in helper_function")
                return (False, None)
            du_nu, sample_rate, codec, da_nu = data
            self.log(
                u"Determining codec and sample rate with dummy text... done")
        else:
            self.log(u"Reading codec and sample rate from OUTPUT_AUDIO_FORMAT")
            codec, channels_nu, sample_rate = self.OUTPUT_AUDIO_FORMAT
        self.log(u"Determining codec and sample rate... done")
        self.log([u"  codec:       %s", codec])
        self.log([u"  sample rate: %d", sample_rate])

        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = codec
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate

        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        loop_function = self._loop_use_cache if self.use_cache else self._loop_no_cache
        for num, fragment in enumerate(fragments):
            succeeded, data = loop_function(helper_function=helper_function,
                                            num=num,
                                            fragment=fragment)
            if not succeeded:
                self.log_crit(u"An unexpected error occurred in loop_function")
                return (False, None)
            duration, sr_nu, enc_nu, samples = data
            # store for later output
            anchors.append([current_time, fragment.identifier, fragment.text])
            # increase the character counter
            num_chars += fragment.characters
            # concatenate new samples
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                output_file.add_samples(samples, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log(
                    [u"Quitting after reached duration %.3f", current_time])
                break

        # minimize memory
        self.log(u"Minimizing memory...")
        output_file.minimize_memory()
        self.log(u"Minimizing memory... done")

        # if backwards, we need to reverse the audio samples again
        if backwards:
            self.log(u"Reversing audio samples...")
            output_file.reverse()
            self.log(u"Reversing audio samples... done")

        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)

        # return output
        if backwards:
            self.log_warn(
                u"Please note that anchor time values do not make sense since backwards=True"
            )
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine using multiple generic function... done")
        return (True, (anchors, current_time, num_chars))
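
The docstring spells out the contract for helper_function: it takes (text, voice_code, output_file_path) and returns (result, (audio_length, audio_sample_rate, audio_format, audio_samples)). A skeleton of a conforming helper might look like the following; the body is a placeholder, not a real TTS backend:

def my_helper_function(text, voice_code, output_file_path):
    # placeholder body: a real implementation would synthesize `text`
    # with `voice_code`, writing to output_file_path when it is not None
    try:
        audio_length = 1.0          # duration of the synthesized audio
        audio_sample_rate = 16000   # in Hz
        audio_format = "pcm16"      # codec string for the AudioFile object
        audio_samples = []          # e.g. an array of PCM samples
        return (True, (audio_length, audio_sample_rate, audio_format, audio_samples))
    except Exception:
        return (False, None)
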
Example No. 5
    def _synthesize_multiple_subprocess(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize multiple fragments via ``subprocess``.

        :rtype: tuple (result, (anchors, current_time, num_chars))
        """
        def synthesize_and_clean(text, voice_code):
            """
            Synthesize a single fragment via subprocess,
            and immediately remove the temporary file.

            :rtype: tuple (duration, sample_rate, encoding, samples)
            """
            self.log(u"Synthesizing text...")
            handler, tmp_destination = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
            result, data = self._synthesize_single_subprocess(
                text=(text + u" "),
                voice_code=voice_code,
                output_file_path=tmp_destination
            )
            self.log([u"Removing temporary file '%s'", tmp_destination])
            gf.delete_file(handler, tmp_destination)
            self.log(u"Synthesizing text... done")
            return data

        self.log(u"Calling TTS engine via subprocess...")

        try:
            # get sample rate and encoding
            du_nu, sample_rate, encoding, da_nu = synthesize_and_clean(
                text=u"Dummy text to get sample_rate",
                voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE)
            )

            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = encoding
            output_file.audio_channels = 1
            output_file.audio_sample_rate = sample_rate

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num = 0
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for fragment in fragments:
                # language to voice code
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, samples = synthesize_and_clean(
                    text=fragment.filtered_text,
                    voice_code=voice_code
                )
                # store for later output
                anchors.append([current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # concatenate new samples
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    output_file.add_samples(samples, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # increment fragment counter
                num += 1
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([u"Quitting after reached duration %.3f", current_time])
                    break

            # minimize memory
            self.log(u"Minimizing memory...")
            output_file.minimize_memory()
            self.log(u"Minimizing memory... done")

            # if backwards, we need to reverse the audio samples again
            if backwards:
                self.log(u"Reversing audio samples...")
                output_file.reverse()
                self.log(u"Reversing audio samples... done")

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while calling TTS engine via subprocess", exc, False, None)
            return (False, None)

        # return output
        if backwards:
            self.log_warn(u"Please note that anchor time values do not make sense since backwards=True")
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via subprocess... done")
        return (True, (anchors, current_time, num_chars))
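
The synthesize_and_clean closure follows a create-use-delete pattern for its temporary WAV file; gf.tmp_file and gf.delete_file are the library's wrappers around that pattern. A generic stand-in using only the standard library:

import os
import tempfile

def with_temp_wav(produce, consume):
    # create-use-delete pattern for a temporary WAV file
    # (generic stand-in for gf.tmp_file / gf.delete_file)
    handler, tmp_path = tempfile.mkstemp(suffix=".wav")
    try:
        produce(tmp_path)           # e.g. a subprocess TTS call writes the file
        return consume(tmp_path)    # e.g. read duration, sample rate, samples
    finally:
        os.close(handler)           # close the open file descriptor
        os.remove(tmp_path)         # remove the temporary file from disk
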
Example No. 6
    def _synthesize_multiple_python(self,
                                    text_file,
                                    output_file_path,
                                    quit_after=None,
                                    backwards=False):
        """
        Synthesize multiple text fragments, via Python call.

        Return a tuple (anchors, total_time, num_chars).

        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # TODO in the Speect Python API I was not able to find a way
        #      to generate the wave incrementally
        #      so I essentially copy the subprocess call mechanism:
        #      generating wave data for each fragment,
        #      and concatenating them together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # get sample rate and encoding
            du_nu, sample_rate, encoding, da_nu = self._synthesize_single_helper(
                text=u"Dummy text to get sample_rate",
                voice_code=self.DEFAULT_LANGUAGE)

            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = encoding
            output_file.audio_channels = 1
            output_file.audio_sample_rate = sample_rate

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num = 0
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for fragment in fragments:
                # language to voice code
                #
                # NOTE since voice_code is actually ignored
                # in _synthesize_single_helper(),
                # the value of voice_code is irrelevant
                #
                # however, in general you need to apply
                # the _language_to_voice_code() function that maps
                # the text language to a voice code
                #
                # here we apply the _language_to_voice_code() defined in super()
                # that sets voice_code = fragment.language
                #
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code)
                # store for later output
                anchors.append(
                    [current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # append new data
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    # if backwards, we append the data reversed
                    output_file.add_samples(data, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # increment fragment counter
                num += 1
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([
                        u"Quitting after reached duration %.3f", current_time
                    ])
                    break

            # if backwards, we need to reverse the audio samples again
            if backwards:
                output_file.reverse()

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(
                u"An unexpected error occurred while calling TTS engine via Python",
                exc, False, None)
            return (False, None)

        # return output
        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
Example No. 7
    def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize multiple text fragments, via Python call.

        Return a tuple (anchors, total_time, num_chars).

        :rtype: (bool, (list, TimeValue, int))
        """
        #
        # TODO in the Speect Python API I was not able to find a way
        #      to generate the wave incrementally
        #      so I essentially copy the subprocess call mechanism:
        #      generating wave data for each fragment,
        #      and concatenating them together
        #
        self.log(u"Calling TTS engine via Python...")
        try:
            # get sample rate and encoding
            du_nu, sample_rate, encoding, da_nu = self._synthesize_single_helper(
                text=u"Dummy text to get sample_rate",
                voice_code=self.DEFAULT_LANGUAGE
            )

            # open output file
            output_file = AudioFile(rconf=self.rconf, logger=self.logger)
            output_file.audio_format = encoding
            output_file.audio_channels = 1
            output_file.audio_sample_rate = sample_rate

            # create output
            anchors = []
            current_time = TimeValue("0.000")
            num = 0
            num_chars = 0
            fragments = text_file.fragments
            if backwards:
                fragments = fragments[::-1]
            for fragment in fragments:
                # language to voice code
                #
                # NOTE since voice_code is actually ignored
                # in _synthesize_single_helper(),
                # the value of voice_code is irrelevant
                #
                # however, in general you need to apply
                # the _language_to_voice_code() function that maps
                # the text language to a voice code
                #
                # here we apply the _language_to_voice_code() defined in super()
                # that sets voice_code = fragment.language
                #
                voice_code = self._language_to_voice_code(fragment.language)
                # synthesize and get the duration of the output file
                self.log([u"Synthesizing fragment %d", num])
                duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                    text=(fragment.filtered_text + u" "),
                    voice_code=voice_code
                )
                # store for later output
                anchors.append([current_time, fragment.identifier, fragment.text])
                # increase the character counter
                num_chars += fragment.characters
                # append new data
                self.log([u"Fragment %d starts at: %.3f", num, current_time])
                if duration > 0:
                    self.log([u"Fragment %d duration: %.3f", num, duration])
                    current_time += duration
                    # if backwards, we append the data reversed
                    output_file.add_samples(data, reverse=backwards)
                else:
                    self.log([u"Fragment %d has zero duration", num])
                # increment fragment counter
                num += 1
                # check if we must stop synthesizing because we have enough audio
                if (quit_after is not None) and (current_time > quit_after):
                    self.log([u"Quitting after reached duration %.3f", current_time])
                    break

            # if backwards, we need to reverse the audio samples again
            if backwards:
                output_file.reverse()

            # write output file
            self.log([u"Writing audio file '%s'", output_file_path])
            output_file.write(file_path=output_file_path)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while calling TTS engine via Python", exc, False, None)
            return (False, None)

        # return output
        # NOTE anchors do not make sense if backwards
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Calling TTS engine via Python... done")
        return (True, (anchors, current_time, num_chars))
Example No. 8
    def _cut_head_tail(self, audio_file_path):
        """
        Set the audio file head or tail,
        suitably cutting the audio file on disk,
        and setting the corresponding parameters in the task configuration.

        Return a success bool flag.
        """
        self._log("Setting head and/or tail")
        try:
            configuration = self.task.configuration
            head_length = configuration.is_audio_file_head_length
            process_length = configuration.is_audio_file_process_length
            detect_head_min = configuration.is_audio_file_detect_head_min
            detect_head_max = configuration.is_audio_file_detect_head_max
            detect_tail_min = configuration.is_audio_file_detect_tail_min
            detect_tail_max = configuration.is_audio_file_detect_tail_max

            # explicit head or process?
            explicit = (head_length is not None) or (process_length is not None)

            # at least one detect parameter?
            detect = (
                (detect_head_min is not None) or
                (detect_head_max is not None) or
                (detect_tail_min is not None) or
                (detect_tail_max is not None)
            )

            if explicit or detect:
                # we need to load the audio data
                audio_file = AudioFile(audio_file_path, logger=self.logger)
                audio_file.load_data()

                if explicit:
                    self._log("Explicit head or process")
                else:
                    self._log("No explicit head or process => detecting head/tail")

                    head = 0.0
                    if (detect_head_min is not None) or (detect_head_max is not None):
                        self._log("Detecting head...")
                        detect_head_min = gf.safe_float(detect_head_min, gc.SD_MIN_HEAD_LENGTH)
                        detect_head_max = gf.safe_float(detect_head_max, gc.SD_MAX_HEAD_LENGTH)
                        self._log(["detect_head_min is %.3f", detect_head_min])
                        self._log(["detect_head_max is %.3f", detect_head_max])
                        sd = SD(audio_file, self.task.text_file, logger=self.logger)
                        head = sd.detect_head(detect_head_min, detect_head_max)
                        self._log(["Detected head: %.3f", head])

                    tail = 0.0
                    if (detect_tail_min is not None) or (detect_tail_max is not None):
                        self._log("Detecting tail...")
                        detect_tail_max = gf.safe_float(detect_tail_max, gc.SD_MAX_TAIL_LENGTH)
                        detect_tail_min = gf.safe_float(detect_tail_min, gc.SD_MIN_TAIL_LENGTH)
                        self._log(["detect_tail_min is %.3f", detect_tail_min])
                        self._log(["detect_tail_max is %.3f", detect_tail_max])
                        sd = SD(audio_file, self.task.text_file, logger=self.logger)
                        tail = sd.detect_tail(detect_tail_min, detect_tail_max)
                        self._log(["Detected tail: %.3f", tail])

                    # sanity check
                    head_length = max(0, head)
                    process_length = max(0, audio_file.audio_length - tail - head)

                    # we need to set these values
                    # in the config object for later use
                    self.task.configuration.is_audio_file_head_length = head_length
                    self.task.configuration.is_audio_file_process_length = process_length
                    self._log(["Set head_length:    %.3f", head_length])
                    self._log(["Set process_length: %.3f", process_length])

                if head_length is not None:
                    # in case we are reading from config object
                    head_length = float(head_length)
                if process_length is not None:
                    # in case we are reading from config object
                    process_length = float(process_length)
                # note that str() is necessary, as one might be None
                self._log(["is_audio_file_head_length is %s", str(head_length)])
                self._log(["is_audio_file_process_length is %s", str(process_length)])
                self._log("Trimming audio data...")
                audio_file.trim(head_length, process_length)
                self._log("Trimming audio data... done")
                self._log("Writing audio file...")
                audio_file.write(audio_file_path)
                self._log("Writing audio file... done")
                audio_file.clear_data()
            else:
                # nothing to do
                self._log("No explicit head/process or detect head/tail")

            self._log("Setting head and/or tail: succeeded")
            return True
        except Exception as e:
            self._log("Setting head and/or tail: failed")
            self._log(["Message: %s", str(e)])
            return False