def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple text fragments, via Python call.

    Return a tuple (anchors, total_time, num_chars).

    :param text_file: the text file whose fragments should be synthesized
    :param string output_file_path: path of the WAVE file to be written
    :param quit_after: if not ``None``, stop synthesizing as soon as the
                       produced audio exceeds this duration
    :param bool backwards: if ``True``, synthesize the fragments in reverse
                           order (anchors then do not make sense)
    :rtype: (bool, (list, TimeValue, int))
    """
    #
    # generating wave data for each fragment,
    # and concatenating them together
    #
    self.log(u"Calling TTS engine via Python...")
    try:
        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = "pcm16"
        output_file.audio_channels = 1
        output_file.audio_sample_rate = self.SAMPLE_RATE
        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num = 0
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        for fragment in fragments:
            # language to voice code
            voice_code = self._language_to_voice_code(fragment.language)
            # synthesize and get the duration of the output file
            self.log([u"Synthesizing fragment %d", num])
            duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code
            )
            # store for later output
            anchors.append([current_time, fragment.identifier, fragment.text])
            # increase the character counter
            num_chars += fragment.characters
            # append new data
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # if backwards, we append the data reversed
                output_file.add_samples(data, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # increment fragment counter
            num += 1
            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log([u"Quitting after reached duration %.3f", current_time])
                break
        # if backwards, we need to reverse the audio samples again
        if backwards:
            output_file.reverse()
        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        # FIX: pass critical=False and raise_type=None (like the sibling
        # _synthesize_multiple_* methods do); the previous call passed
        # type(exc) as raise_type, which made log_exc re-raise and left the
        # (False, None) return below unreachable, breaking the documented
        # (bool, ...) return contract.
        self.log_exc(u"Unexpected exception while calling TTS engine via Python", exc, False, None)
        return (False, None)
    # return output
    # NOTE anchors do not make sense if backwards
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via Python... done")
    return (True, (anchors, current_time, num_chars))
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect start.

    Synthesize a query wave from the text, extract its MFCCs, and slide a
    DTW match over the speech intervals of the real audio to locate where
    the text starts; return the detected start time (0.0 if no candidate).

    # NOTE(review): min_start_length / max_start_length appear to be
    # expressed in seconds and metric an SDMetric value — confirm at callers.
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric: %s", metric])
    self._log(["Backwards: %s", str(backwards)])
    # characters per second of the real audio (used to stretch the query)
    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate: %.3f", audio_rate])
    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav", dir=gf.custom_tmp_dir())
    synt = Synthesizer(logger=self.logger)
    # synthesize more than max_start_length to have enough query audio
    synt_duration = max_start_length * self.QUERY_FACTOR
    self._log(["Synthesizing %.3f seconds", synt_duration])
    result = synt.synthesize(self.text_file, tmp_file_path, quit_after=synt_duration, backwards=backwards)
    self._log("Synthesizing query... done")
    query_file = AudioFile(tmp_file_path)
    if backwards:
        self._log("Reversing query")
        query_file.reverse()
    self._log("Extracting MFCCs for query...")
    query_file.extract_mfcc(frame_rate=self.frame_rate)
    query_file.clear_data()
    self._log("Extracting MFCCs for query... done")
    self._log("Cleaning up...")
    # remove the temporary query wave file
    self._cleanup(tmp_handler, tmp_file_path)
    self._log("Cleaning up... done")
    # result[2] is the number of synthesized characters (see Synthesizer)
    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len
    # stretch factor >= 1: how much slower the real audio is vs the query
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate: %.3f", audio_rate])
    self._log(["Query rate: %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])
    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    # only search within the first max_start_length * AUDIO_FACTOR seconds
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])
    # l = number of MFCC coefficients, o = audio frames, n = query frames
    l, o = audio_mfcc.shape
    l, n = query_mfcc.shape
    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(n * stretch_factor)
    self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
    self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
    self._log(["Stretch factor: %.3f", stretch_factor])
    self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    # keep only speech intervals starting within [min_start_length, max_start_length]
    admissible_intervals = [x for x in self.audio_speech if ((x[0] >= min_start_length) and (x[0] <= max_start_length))]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    candidates = []
    # run counters implementing the two early-stopping heuristics below
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf
    for interval in admissible_intervals:
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log(" Breaking: too many runs without improvement")
            break
        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log(" Breaking: too many runs with minimum required length")
            break
        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log(["Evaluating interval starting at %d == %.3f ", start_index, start_time])
        if start_index > o:
            self._log(" Breaking: start index outside audio window")
            break
        # the match must extend at least stretched_match_minimum_length frames
        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > o:
            self._log(" Breaking: not enough audio left in shifted window")
            break
        # DTW window: at most twice the query length
        end_index = min(start_index + 2 * n, o)
        end_time = self._i2t(end_index)
        self._log([" Start %d == %.3f", start_index, start_time])
        self._log([" Req end %d == %.3f", req_end_index, req_end_time])
        self._log([" Eff end %d == %.3f", end_index, end_time])
        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        l, m = audio_mfcc_sub.shape
        self._log("Computing DTW...")
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(m)
        aligner.synt_wave_length = self._i2t(n)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        # last row: total cost of aligning the whole query against each
        # candidate match length in the audio window
        last_row = acm[-1, :]
        self._log("Computing DTW... done")
        # find the minimum, but its index must be >= stretched_match_minimum_length
        candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        # distortion: cost normalized by match length
        candidate_distortion = candidate_value / candidate_length_index
        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0
        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        # append to the list of candidates
        self._log([" Interval start: %d == %.6f", start_index, start_time])
        self._log([" Interval end: %d == %.6f", end_index, end_time])
        self._log([" Candidate start: %d == %.6f", start_index, start_time])
        self._log([" Candidate end: %d == %.6f", candidate_end_index, candidate_end_time])
        self._log([" Candidate length: %d == %.6f", candidate_length_index, candidate_length_time])
        self._log([" Candidate value: %.6f", candidate_value])
        self._log([" Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })
    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time
def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple text fragments, via Python call.

    Return a tuple (anchors, total_time, num_chars).

    :param text_file: the text file whose fragments should be synthesized
    :param string output_file_path: path of the WAVE file to be written
    :param quit_after: if not ``None``, stop synthesizing as soon as the
                       produced audio exceeds this duration
    :param bool backwards: if ``True``, synthesize the fragments in reverse
                           order (anchors then do not make sense)
    :rtype: (bool, (list, TimeValue, int))
    """
    #
    # generating wave data for each fragment,
    # and concatenating them together
    #
    self.log(u"Calling TTS engine via Python...")
    try:
        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = "pcm16"
        output_file.audio_channels = 1
        output_file.audio_sample_rate = self.SAMPLE_RATE
        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num = 0
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        for fragment in fragments:
            # language to voice code
            voice_code = self._language_to_voice_code(fragment.language)
            # synthesize and get the duration of the output file
            self.log([u"Synthesizing fragment %d", num])
            duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code)
            # store for later output
            anchors.append(
                [current_time, fragment.identifier, fragment.text])
            # increase the character counter
            num_chars += fragment.characters
            # append new data
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # if backwards, we append the data reversed
                output_file.add_samples(data, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # increment fragment counter
            num += 1
            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log([
                    u"Quitting after reached duration %.3f", current_time
                ])
                break
        # if backwards, we need to reverse the audio samples again
        if backwards:
            output_file.reverse()
        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        # FIX: pass critical=False and raise_type=None (like the sibling
        # _synthesize_multiple_* methods do); the previous call passed
        # type(exc) as raise_type, which made log_exc re-raise and left the
        # (False, None) return below unreachable, breaking the documented
        # (bool, ...) return contract.
        self.log_exc(
            u"Unexpected exception while calling TTS engine via Python",
            exc, False, None)
        return (False, None)
    # return output
    # NOTE anchors do not make sense if backwards
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via Python... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_generic(self, helper_function, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple fragments, generic function.

    The ``helper_function`` is a function that takes parameters
    ``(text, voice_code, output_file_path)``
    and returns a tuple
    ``(result, (audio_length, audio_sample_rate, audio_format, audio_samples))``.

    :param helper_function: the per-fragment synthesis callable (see above)
    :param text_file: the text file whose fragments should be synthesized
    :param string output_file_path: path of the WAVE file to be written
    :param quit_after: if not ``None``, stop synthesizing as soon as the
                       produced audio exceeds this duration
    :param bool backwards: if ``True``, synthesize the fragments in reverse
                           order (anchor times then do not make sense)
    :rtype: tuple (result, (anchors, current_time, num_chars))
    """
    self.log(u"Calling TTS engine using multiple generic function...")
    # get sample rate and codec
    self.log(u"Determining codec and sample rate...")
    if (self.OUTPUT_AUDIO_FORMAT is None) or (len(self.OUTPUT_AUDIO_FORMAT) != 3):
        # format unknown: probe the engine with a throwaway synthesis
        self.log(u"Determining codec and sample rate with dummy text...")
        succeeded, data = helper_function(
            text=u"Dummy text to get sample_rate",
            voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE),
            output_file_path=None)
        if not succeeded:
            self.log_crit(
                u"An unexpected error occurred in helper_function")
            return (False, None)
        du_nu, sample_rate, codec, da_nu = data
        self.log(
            u"Determining codec and sample rate with dummy text... done")
    else:
        # format declared by the subclass: (codec, channels, sample_rate)
        self.log(u"Reading codec and sample rate from OUTPUT_AUDIO_FORMAT")
        codec, channels_nu, sample_rate = self.OUTPUT_AUDIO_FORMAT
    self.log(u"Determining codec and sample rate... done")
    self.log([u" codec: %s", codec])
    self.log([u" sample rate: %d", sample_rate])
    # open output file
    output_file = AudioFile(rconf=self.rconf, logger=self.logger)
    output_file.audio_format = codec
    output_file.audio_channels = 1
    output_file.audio_sample_rate = sample_rate
    # create output
    anchors = []
    current_time = TimeValue("0.000")
    num_chars = 0
    fragments = text_file.fragments
    if backwards:
        fragments = fragments[::-1]
    # use the cached or the uncached per-fragment loop body
    loop_function = self._loop_use_cache if self.use_cache else self._loop_no_cache
    for num, fragment in enumerate(fragments):
        succeeded, data = loop_function(helper_function=helper_function, num=num, fragment=fragment)
        if not succeeded:
            self.log_crit(u"An unexpected error occurred in loop_function")
            return (False, None)
        duration, sr_nu, enc_nu, samples = data
        # store for later output
        anchors.append([current_time, fragment.identifier, fragment.text])
        # increase the character counter
        num_chars += fragment.characters
        # concatenate new samples
        self.log([u"Fragment %d starts at: %.3f", num, current_time])
        if duration > 0:
            self.log([u"Fragment %d duration: %.3f", num, duration])
            current_time += duration
            output_file.add_samples(samples, reverse=backwards)
        else:
            self.log([u"Fragment %d has zero duration", num])
        # check if we must stop synthesizing because we have enough audio
        if (quit_after is not None) and (current_time > quit_after):
            self.log(
                [u"Quitting after reached duration %.3f", current_time])
            break
    # minimize memory
    self.log(u"Minimizing memory...")
    output_file.minimize_memory()
    self.log(u"Minimizing memory... done")
    # if backwards, we need to reverse the audio samples again
    if backwards:
        self.log(u"Reversing audio samples...")
        output_file.reverse()
        self.log(u"Reversing audio samples... done")
    # write output file
    self.log([u"Writing audio file '%s'", output_file_path])
    output_file.write(file_path=output_file_path)
    # return output
    if backwards:
        self.log_warn(
            u"Please note that anchor time values do not make sense since backwards=True"
        )
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine using multiple generic function... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_subprocess(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple fragments via ``subprocess``.

    :param text_file: the text file whose fragments should be synthesized
    :param string output_file_path: path of the WAVE file to be written
    :param quit_after: if not ``None``, stop synthesizing as soon as the
                       produced audio exceeds this duration
    :param bool backwards: if ``True``, synthesize the fragments in reverse
                           order (anchor times then do not make sense)
    :rtype: tuple (result, (anchors, current_time, num_chars))
    """
    def synthesize_and_clean(text, voice_code):
        """
        Synthesize a single fragment via subprocess,
        and immediately remove the temporary file.

        :rtype: tuple (duration, sample_rate, encoding, samples)
        """
        self.log(u"Synthesizing text...")
        handler, tmp_destination = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        # NOTE(review): the boolean result flag is ignored here; on failure
        # data is presumably None and the unpacking at the call site raises,
        # which is caught by the outer try/except — confirm intended.
        result, data = self._synthesize_single_subprocess(
            text=(text + u" "),
            voice_code=voice_code,
            output_file_path=tmp_destination
        )
        self.log([u"Removing temporary file '%s'", tmp_destination])
        gf.delete_file(handler, tmp_destination)
        self.log(u"Synthesizing text... done")
        return data

    self.log(u"Calling TTS engine via subprocess...")
    try:
        # get sample rate and encoding
        du_nu, sample_rate, encoding, da_nu = synthesize_and_clean(
            text=u"Dummy text to get sample_rate",
            voice_code=self._language_to_voice_code(self.DEFAULT_LANGUAGE)
        )
        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = encoding
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate
        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num = 0
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        for fragment in fragments:
            # language to voice code
            voice_code = self._language_to_voice_code(fragment.language)
            # synthesize and get the duration of the output file
            self.log([u"Synthesizing fragment %d", num])
            duration, sr_nu, enc_nu, samples = synthesize_and_clean(
                text=fragment.filtered_text,
                voice_code=voice_code
            )
            # store for later output
            anchors.append([current_time, fragment.identifier, fragment.text])
            # increase the character counter
            num_chars += fragment.characters
            # concatenate new samples
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                output_file.add_samples(samples, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # increment fragment counter
            num += 1
            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log([u"Quitting after reached duration %.3f", current_time])
                break
        # minimize memory
        self.log(u"Minimizing memory...")
        output_file.minimize_memory()
        self.log(u"Minimizing memory... done")
        # if backwards, we need to reverse the audio samples again
        if backwards:
            self.log(u"Reversing audio samples...")
            output_file.reverse()
            self.log(u"Reversing audio samples... done")
        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        self.log_exc(u"An unexpected error occurred while calling TTS engine via subprocess", exc, False, None)
        return (False, None)
    # return output
    if backwards:
        self.log_warn(u"Please note that anchor time values do not make sense since backwards=True")
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via subprocess... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple text fragments, via Python call.

    Return a tuple (anchors, total_time, num_chars).

    :param text_file: the text file whose fragments should be synthesized
    :param string output_file_path: path of the WAVE file to be written
    :param quit_after: if not ``None``, stop synthesizing as soon as the
                       produced audio exceeds this duration
    :param bool backwards: if ``True``, synthesize the fragments in reverse
                           order (anchors then do not make sense)
    :rtype: (bool, (list, TimeValue, int))
    """
    #
    # TODO in the Speect Python API I was not able to find a way
    # to generate the wave incrementally
    # so I essentially copy the subprocess call mechanism:
    # generating wave data for each fragment,
    # and concatenating them together
    #
    self.log(u"Calling TTS engine via Python...")
    try:
        # get sample rate and encoding
        # (probe the engine once with a throwaway synthesis)
        du_nu, sample_rate, encoding, da_nu = self._synthesize_single_helper(
            text=u"Dummy text to get sample_rate",
            voice_code=self.DEFAULT_LANGUAGE)
        # open output file
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = encoding
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate
        # create output
        anchors = []
        current_time = TimeValue("0.000")
        num = 0
        num_chars = 0
        fragments = text_file.fragments
        if backwards:
            fragments = fragments[::-1]
        for fragment in fragments:
            # language to voice code
            #
            # NOTE since voice_code is actually ignored
            # in _synthesize_single_helper(),
            # the value of voice_code is irrelevant
            #
            # however, in general you need to apply
            # the _language_to_voice_code() function that maps
            # the text language to a voice code
            #
            # here we apply the _language_to_voice_code() defined in super()
            # that sets voice_code = fragment.language
            #
            voice_code = self._language_to_voice_code(fragment.language)
            # synthesize and get the duration of the output file
            self.log([u"Synthesizing fragment %d", num])
            duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code)
            # store for later output
            anchors.append(
                [current_time, fragment.identifier, fragment.text])
            # increase the character counter
            num_chars += fragment.characters
            # append new data
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # if backwards, we append the data reversed
                output_file.add_samples(data, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # increment fragment counter
            num += 1
            # check if we must stop synthesizing because we have enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log([
                    u"Quitting after reached duration %.3f", current_time
                ])
                break
        # if backwards, we need to reverse the audio samples again
        if backwards:
            output_file.reverse()
        # write output file
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        self.log_exc(
            u"An unexpected error occurred while calling TTS engine via Python",
            exc, False, None)
        return (False, None)
    # return output
    # NOTE anchors do not make sense if backwards
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via Python... done")
    return (True, (anchors, current_time, num_chars))
def _synthesize_multiple_python(self, text_file, output_file_path, quit_after=None, backwards=False):
    """
    Synthesize multiple text fragments, via Python call.

    Return a tuple (anchors, total_time, num_chars).

    :rtype: (bool, (list, TimeValue, int))
    """
    # The Speect Python API does not seem to offer incremental wave
    # generation, so we mirror the subprocess mechanism instead:
    # each fragment is synthesized on its own and the sample blocks
    # are concatenated into a single output wave.
    self.log(u"Calling TTS engine via Python...")
    try:
        # probe the engine once to discover sample rate and encoding
        du_nu, sample_rate, encoding, da_nu = self._synthesize_single_helper(
            text=u"Dummy text to get sample_rate",
            voice_code=self.DEFAULT_LANGUAGE
        )
        # prepare the container accumulating the synthesized samples
        output_file = AudioFile(rconf=self.rconf, logger=self.logger)
        output_file.audio_format = encoding
        output_file.audio_channels = 1
        output_file.audio_sample_rate = sample_rate
        anchors = []
        current_time = TimeValue("0.000")
        num_chars = 0
        # when synthesizing backwards, walk the fragments in reverse order
        fragments = text_file.fragments[::-1] if backwards else text_file.fragments
        for num, fragment in enumerate(fragments):
            # NOTE voice_code is actually ignored by
            # _synthesize_single_helper(); the mapping below is kept for
            # generality — the super() implementation of
            # _language_to_voice_code() simply returns fragment.language
            voice_code = self._language_to_voice_code(fragment.language)
            self.log([u"Synthesizing fragment %d", num])
            duration, sr_nu, enc_nu, data = self._synthesize_single_helper(
                text=(fragment.filtered_text + u" "),
                voice_code=voice_code
            )
            # remember where this fragment starts in the output wave
            anchors.append([current_time, fragment.identifier, fragment.text])
            num_chars += fragment.characters
            self.log([u"Fragment %d starts at: %.3f", num, current_time])
            if duration > 0:
                self.log([u"Fragment %d duration: %.3f", num, duration])
                current_time += duration
                # when going backwards, store the samples reversed
                output_file.add_samples(data, reverse=backwards)
            else:
                self.log([u"Fragment %d has zero duration", num])
            # stop early once we have produced enough audio
            if (quit_after is not None) and (current_time > quit_after):
                self.log([u"Quitting after reached duration %.3f", current_time])
                break
        # undo the per-fragment reversal so the wave plays forward
        if backwards:
            output_file.reverse()
        self.log([u"Writing audio file '%s'", output_file_path])
        output_file.write(file_path=output_file_path)
    except Exception as exc:
        self.log_exc(u"An unexpected error occurred while calling TTS engine via Python", exc, False, None)
        return (False, None)
    # NOTE anchors do not make sense if backwards
    self.log([u"Returning %d time anchors", len(anchors)])
    self.log([u"Current time %.3f", current_time])
    self.log([u"Synthesized %d characters", num_chars])
    self.log(u"Calling TTS engine via Python... done")
    return (True, (anchors, current_time, num_chars))
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect start.

    Synthesize a query wave from the text, extract its MFCCs, and slide a
    DTW match over the speech intervals of the real audio to locate where
    the text starts; return the detected start time (0.0 if no candidate).

    # NOTE(review): min_start_length / max_start_length appear to be
    # expressed in seconds and metric an SDMetric value — confirm at callers.
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric: %s", metric])
    self._log(["Backwards: %s", str(backwards)])
    # characters per second of the real audio (used to stretch the query)
    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate: %.3f", audio_rate])
    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(
        suffix=".wav", dir=gf.custom_tmp_dir()
    )
    synt = Synthesizer(logger=self.logger)
    # synthesize more than max_start_length to have enough query audio
    synt_duration = max_start_length * self.QUERY_FACTOR
    self._log(["Synthesizing %.3f seconds", synt_duration])
    result = synt.synthesize(
        self.text_file,
        tmp_file_path,
        quit_after=synt_duration,
        backwards=backwards
    )
    self._log("Synthesizing query... done")
    query_file = AudioFile(tmp_file_path)
    if backwards:
        self._log("Reversing query")
        query_file.reverse()
    self._log("Extracting MFCCs for query...")
    query_file.extract_mfcc(frame_rate=self.frame_rate)
    query_file.clear_data()
    self._log("Extracting MFCCs for query... done")
    self._log("Cleaning up...")
    # remove the temporary query wave file
    self._cleanup(tmp_handler, tmp_file_path)
    self._log("Cleaning up... done")
    # result[2] is the number of synthesized characters (see Synthesizer)
    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len
    # stretch factor >= 1: how much slower the real audio is vs the query
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate: %.3f", audio_rate])
    self._log(["Query rate: %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])
    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    # only search within the first max_start_length * AUDIO_FACTOR seconds
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])
    # l = number of MFCC coefficients, o = audio frames, n = query frames
    l, o = audio_mfcc.shape
    l, n = query_mfcc.shape
    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(n * stretch_factor)
    self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
    self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
    self._log(["Stretch factor: %.3f", stretch_factor])
    self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    # keep only speech intervals starting within [min_start_length, max_start_length]
    admissible_intervals = [x for x in self.audio_speech if ((x[0] >= min_start_length) and (x[0] <= max_start_length))]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    candidates = []
    # run counters implementing the two early-stopping heuristics below
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf
    for interval in admissible_intervals:
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log(" Breaking: too many runs without improvement")
            break
        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log(" Breaking: too many runs with minimum required length")
            break
        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log(["Evaluating interval starting at %d == %.3f ", start_index, start_time])
        if start_index > o:
            self._log(" Breaking: start index outside audio window")
            break
        # the match must extend at least stretched_match_minimum_length frames
        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > o:
            self._log(" Breaking: not enough audio left in shifted window")
            break
        # DTW window: at most twice the query length
        end_index = min(start_index + 2 * n, o)
        end_time = self._i2t(end_index)
        self._log([" Start %d == %.3f", start_index, start_time])
        self._log([" Req end %d == %.3f", req_end_index, req_end_time])
        self._log([" Eff end %d == %.3f", end_index, end_time])
        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        l, m = audio_mfcc_sub.shape
        self._log("Computing DTW...")
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(m)
        aligner.synt_wave_length = self._i2t(n)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        # last row: total cost of aligning the whole query against each
        # candidate match length in the audio window
        last_row = acm[-1, :]
        self._log("Computing DTW... done")
        # find the minimum, but its index must be >= stretched_match_minimum_length
        candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        # distortion: cost normalized by match length
        candidate_distortion = candidate_value / candidate_length_index
        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0
        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        # append to the list of candidates
        self._log([" Interval start: %d == %.6f", start_index, start_time])
        self._log([" Interval end: %d == %.6f", end_index, end_time])
        self._log([" Candidate start: %d == %.6f", start_index, start_time])
        self._log([" Candidate end: %d == %.6f", candidate_end_index, candidate_end_time])
        self._log([" Candidate length: %d == %.6f", candidate_length_index, candidate_length_time])
        self._log([" Candidate value: %.6f", candidate_value])
        self._log([" Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })
    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time