def main():
    """
    Entry point: extract the MFCCs of an audio file and save them as text.

    The command line must provide two arguments: the path of the audio file
    to read (``sys.argv[1]``) and the path of the output file
    (``sys.argv[2]``). If fewer arguments are given, ``usage()`` is called
    and the function returns without doing anything else.

    A warning is printed when the Python C extensions cannot be run.
    The MFCC matrix is written with ``numpy.savetxt``.
    """
    if len(sys.argv) < 3:
        usage()
        return
    file_path = sys.argv[1]
    save_path = sys.argv[2]

    # NOTE: single-argument print(...) calls behave identically
    #       under Python 2 (parenthesized expression) and Python 3 (function)
    if not gf.can_run_c_extension():
        print("[WARN] Unable to load Python C Extensions")
        print("[WARN] Running the slower pure Python code")
        print("[WARN] See the README file for directions to compile the Python C Extensions")

    audiofile = AudioFile(file_path)
    audiofile.load_data()
    audiofile.extract_mfcc()
    # the audio samples are released before saving; only the MFCCs are used below
    audiofile.clear_data()
    numpy.savetxt(save_path, audiofile.audio_mfcc)
    print("[INFO] MFCCs saved to %s" % (save_path))
def _cut_head_tail(self, audio_file_path):
    """
    Trim the head and/or tail of the audio file on disk,
    and store the corresponding values in the task configuration.

    If the head length or the process length is set explicitly
    in the task configuration, those values are used as they are.
    Otherwise, if at least one "detect" parameter is set,
    head and tail are detected with ``SD``, and the resulting
    head and process lengths are written back to the task configuration.
    In both cases the audio data is trimmed and written back
    to ``audio_file_path``.
    If no explicit nor detect parameter is set, nothing is done.

    Any exception raised along the way is logged,
    and reported through the return value.

    :param audio_file_path: the path of the audio file to be trimmed
    :rtype: bool (``True`` on success, ``False`` on failure)
    """
    self._log("Setting head and/or tail")
    try:
        conf = self.task.configuration
        head_length = conf.is_audio_file_head_length
        process_length = conf.is_audio_file_process_length
        detect_head_min = conf.is_audio_file_detect_head_min
        detect_head_max = conf.is_audio_file_detect_head_max
        detect_tail_min = conf.is_audio_file_detect_tail_min
        detect_tail_max = conf.is_audio_file_detect_tail_max

        # what has been requested?
        has_explicit = not ((head_length is None) and (process_length is None))
        wants_head = not ((detect_head_min is None) and (detect_head_max is None))
        wants_tail = not ((detect_tail_min is None) and (detect_tail_max is None))

        if not (has_explicit or wants_head or wants_tail):
            # nothing to do
            self._log("No explicit head/process or detect head/tail")
        else:
            # the audio data is needed from now on
            audio_file = AudioFile(audio_file_path, logger=self.logger)
            audio_file.load_data()

            if has_explicit:
                self._log("Explicit head or process")
            else:
                self._log("No explicit head or process => detecting head/tail")

                head = 0.0
                if wants_head:
                    self._log("Detecting head...")
                    detect_head_min = gf.safe_float(detect_head_min, gc.SD_MIN_HEAD_LENGTH)
                    detect_head_max = gf.safe_float(detect_head_max, gc.SD_MAX_HEAD_LENGTH)
                    self._log(["detect_head_min is %.3f", detect_head_min])
                    self._log(["detect_head_max is %.3f", detect_head_max])
                    detector = SD(audio_file, self.task.text_file, logger=self.logger)
                    head = detector.detect_head(detect_head_min, detect_head_max)
                    self._log(["Detected head: %.3f", head])

                tail = 0.0
                if wants_tail:
                    self._log("Detecting tail...")
                    detect_tail_max = gf.safe_float(detect_tail_max, gc.SD_MAX_TAIL_LENGTH)
                    detect_tail_min = gf.safe_float(detect_tail_min, gc.SD_MIN_TAIL_LENGTH)
                    self._log(["detect_tail_min is %.3f", detect_tail_min])
                    self._log(["detect_tail_max is %.3f", detect_tail_max])
                    detector = SD(audio_file, self.task.text_file, logger=self.logger)
                    tail = detector.detect_tail(detect_tail_min, detect_tail_max)
                    self._log(["Detected tail: %.3f", tail])

                # clamp both lengths to be non-negative
                head_length = max(0, head)
                process_length = max(0, audio_file.audio_length - tail - head)

                # store the detected values in the config object for later use
                self.task.configuration.is_audio_file_head_length = head_length
                self.task.configuration.is_audio_file_process_length = process_length
                self._log(["Set head_length: %.3f", head_length])
                self._log(["Set process_length: %.3f", process_length])

            # values read from the config object might not be floats yet;
            # either of them might still be None
            head_length = None if head_length is None else float(head_length)
            process_length = None if process_length is None else float(process_length)

            # str() is required, since one of the two might be None
            self._log(["is_audio_file_head_length is %s", str(head_length)])
            self._log(["is_audio_file_process_length is %s", str(process_length)])

            self._log("Trimming audio data...")
            audio_file.trim(head_length, process_length)
            self._log("Trimming audio data... done")

            self._log("Writing audio file...")
            audio_file.write(audio_file_path)
            self._log("Writing audio file... done")

            audio_file.clear_data()

        self._log("Setting head and/or tail: succeeded")
        return True
    except Exception as exc:
        self._log("Setting head and/or tail: failed")
        self._log(["Message: %s", str(exc)])
        return False
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect the time at which the (synthesized) text starts
    in the real audio.

    A query is synthesized from the beginning of ``self.text_file``
    into a temporary WAVE file, its MFCCs are extracted,
    and then, for each admissible speech interval of the real audio,
    the accumulated cost matrix of the DTW between the query and
    the audio window starting at that interval is computed.
    Each evaluated interval yields a candidate; the best candidate
    is chosen by ``self._select_best_candidate``.

    The temporary file is always cleaned up, even if synthesizing
    the query or extracting its MFCCs raises an exception.

    :param min_start_length: only speech intervals whose begin time
                             is at least this value are evaluated
    :param max_start_length: only speech intervals whose begin time
                             is at most this value are evaluated;
                             it also bounds the synthesized query length
                             and the portion of real audio considered
    :param metric: the metric used to evaluate candidates
                   (``SDMetric.VALUE`` or ``SDMetric.DISTORTION``)
    :param bool backwards: passed to the synthesizer; if ``True``,
                           the synthesized query is also reversed
    :rtype: the time corresponding to the start index of the best candidate,
            or ``0.0`` if there is no best candidate
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric: %s", metric])
    self._log(["Backwards: %s", str(backwards)])

    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate: %.3f", audio_rate])

    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav", dir=gf.custom_tmp_dir())
    # mkstemp() leaves the deletion of the file to the caller:
    # the finally clause guarantees it happens on the error path too
    try:
        synt = Synthesizer(logger=self.logger)
        synt_duration = max_start_length * self.QUERY_FACTOR
        self._log(["Synthesizing %.3f seconds", synt_duration])
        result = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=backwards
        )
        self._log("Synthesizing query... done")

        query_file = AudioFile(tmp_file_path)
        if backwards:
            self._log("Reversing query")
            query_file.reverse()
        self._log("Extracting MFCCs for query...")
        query_file.extract_mfcc(frame_rate=self.frame_rate)
        query_file.clear_data()
        self._log("Extracting MFCCs for query... done")
    finally:
        self._log("Cleaning up...")
        self._cleanup(tmp_handler, tmp_file_path)
        self._log("Cleaning up... done")

    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len

    # the query is never considered slower than the real audio
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate: %.3f", audio_rate])
    self._log(["Query rate: %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])

    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])

    # o = number of frames of the (limited) real audio
    # n = number of frames of the query
    _, o = audio_mfcc.shape
    _, n = query_mfcc.shape

    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(n * stretch_factor)

    self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
    self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
    self._log(["Stretch factor: %.3f", stretch_factor])
    self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log([
            " %d %d == %.3f %.3f",
            self._t2i(interval[0]),
            self._t2i(interval[1]),
            interval[0],
            interval[1]
        ])

    # only speech intervals beginning within [min, max] are evaluated
    admissible_intervals = [
        x for x in self.audio_speech
        if ((x[0] >= min_start_length) and (x[0] <= max_start_length))
    ]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log([
            " %d %d == %.3f %.3f",
            self._t2i(interval[0]),
            self._t2i(interval[1]),
            interval[0],
            interval[1]
        ])

    candidates = []
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf

    for interval in admissible_intervals:
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log(" Breaking: too many runs without improvement")
            break

        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log(" Breaking: too many runs with minimum required length")
            break

        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log([
            "Evaluating interval starting at %d == %.3f ",
            start_index,
            start_time
        ])
        if start_index > o:
            self._log(" Breaking: start index outside audio window")
            break

        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > o:
            self._log(" Breaking: not enough audio left in shifted window")
            break

        # the audio window is at most twice as long as the query
        end_index = min(start_index + 2 * n, o)
        end_time = self._i2t(end_index)

        self._log([" Start %d == %.3f", start_index, start_time])
        self._log([" Req end %d == %.3f", req_end_index, req_end_time])
        self._log([" Eff end %d == %.3f", end_index, end_time])

        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        _, m = audio_mfcc_sub.shape

        self._log("Computing DTW...")
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(m)
        aligner.synt_wave_length = self._i2t(n)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        last_row = acm[-1, :]
        self._log("Computing DTW... done")

        # find the minimum, but its index must be >= stretched_match_minimum_length
        # NOTE(review): if last_row has m columns (as the comment above states)
        #               and stretched_match_minimum_length >= m,
        #               this slice is empty and numpy.argmin raises ValueError
        #               -- confirm whether that can happen in practice
        candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        candidate_distortion = candidate_value / candidate_length_index

        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0

        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1

        # append to the list of candidates
        self._log([" Interval start: %d == %.6f", start_index, start_time])
        self._log([" Interval end: %d == %.6f", end_index, end_time])
        self._log([" Candidate start: %d == %.6f", start_index, start_time])
        self._log([" Candidate end: %d == %.6f", candidate_end_index, candidate_end_time])
        self._log([" Candidate length: %d == %.6f", candidate_length_index, candidate_length_time])
        self._log([" Candidate value: %.6f", candidate_value])
        self._log([" Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })

    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time
class AudioFileMFCC(Loggable):
    """
    A monaural (single channel) WAVE audio file, represented as
    a NumPy 2D matrix of Mel-frequency cepstral coefficients (MFCC).

    The matrix is "fat", that is,
    its number of rows is equal to the number of MFCC coefficients
    and its number of columns is equal to the number of window shifts
    in the audio file.
    The number of MFCC coefficients and the MFCC window shift can
    be modified via the
    :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_SIZE`
    and
    :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_WINDOW_SHIFT`
    keys in the ``rconf`` object.

    If ``mfcc_matrix`` is not ``None``,
    it will be used as the MFCC matrix.

    If ``file_path`` or ``audio_file`` is not ``None``,
    the MFCCs will be computed upon creation of the object,
    possibly converting to PCM16 Mono WAVE and/or
    loading audio data in memory.

    The MFCCs for the entire wave
    are divided into three contiguous intervals (possibly, zero-length)::

        HEAD   = [:middle_begin[
        MIDDLE = [middle_begin:middle_end[
        TAIL   = [middle_end:[

    The usual NumPy convention of including the left/start index
    and excluding the right/end index is adopted.

    For alignment purposes, only the ``MIDDLE`` portion of the wave
    is taken into account; the ``HEAD`` and ``TAIL`` intervals are ignored.

    Speech and nonspeech intervals, instead, are stored internally
    as pairs ``(first_frame, last_frame)`` with BOTH ends included,
    as computed by :func:`run_vad`.

    This class heavily uses NumPy views and in-place operations
    to avoid creating temporary data or copying data around.

    :param string file_path: the path of the PCM16 mono WAVE file, or ``None``
    :param tuple file_format: the format of the audio file, if known in advance: ``(codec, channels, rate)`` or ``None``
    :param mfcc_matrix: the MFCC matrix to be set, or ``None``
    :type  mfcc_matrix: :class:`numpy.ndarray`
    :param audio_file: an audio file, or ``None``
    :type  audio_file: :class:`~aeneas.audiofile.AudioFile`
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    :raises: ValueError: if ``file_path``, ``audio_file``, and ``mfcc_matrix`` are all ``None``

    .. versionadded:: 1.5.0
    """

    TAG = u"AudioFileMFCC"

    def __init__(
            self,
            file_path=None,
            file_format=None,
            mfcc_matrix=None,
            audio_file=None,
            rconf=None,
            logger=None
    ):
        if (file_path is None) and (audio_file is None) and (mfcc_matrix is None):
            raise ValueError(
                u"You must initialize with at least one of: file_path, audio_file, or mfcc_matrix"
            )
        super(AudioFileMFCC, self).__init__(rconf=rconf, logger=logger)
        self.file_path = file_path
        self.audio_file = audio_file
        self.is_reversed = False
        self.__mfcc = None
        self.__mfcc_mask = None
        self.__mfcc_mask_map = None
        self.__speech_intervals = None
        self.__nonspeech_intervals = None
        self.log(u"Initializing MFCCs...")
        if mfcc_matrix is not None:
            # an explicit matrix takes precedence over file_path/audio_file;
            # the audio length is derived from the number of frames
            self.__mfcc = mfcc_matrix
            self.audio_length = self.all_length * self.rconf.mws
        elif (self.file_path is not None) or (self.audio_file is not None):
            audio_file_was_none = False
            if self.audio_file is None:
                audio_file_was_none = True
                self.audio_file = AudioFile(
                    file_path=self.file_path,
                    file_format=file_format,
                    rconf=self.rconf,
                    logger=self.logger
                )
                # NOTE load audio samples into memory, if not present already
                self.audio_file.audio_samples
            gf.run_c_extension_with_fallback(
                self.log,
                "cmfcc",
                self._compute_mfcc_c_extension,
                self._compute_mfcc_pure_python,
                (),
                rconf=self.rconf
            )
            self.audio_length = self.audio_file.audio_length
            if audio_file_was_none:
                # the AudioFile was created here, so it is released here
                self.log(u"Clearing the audio data...")
                self.audio_file.clear_data()
                self.audio_file = None
                self.log(u"Clearing the audio data... done")
        # initially, MIDDLE spans the entire wave
        self.__middle_begin = 0
        self.__middle_end = self.__mfcc.shape[1]
        self.log(u"Initializing MFCCs... done")

    def __unicode__(self):
        msg = [
            u"File path:        %s" % self.file_path,
            u"Audio length (s): %s" % gf.safe_float(self.audio_length),
        ]
        return u"\n".join(msg)

    def __str__(self):
        return gf.safe_str(self.__unicode__())

    @property
    def all_mfcc(self):
        """
        The MFCCs of the entire audio file,
        that is, HEAD + MIDDLE + TAIL.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        return self.__mfcc

    @property
    def all_length(self):
        """
        The length, in MFCC frames (columns of the MFCC matrix),
        of the entire audio file,
        that is, HEAD + MIDDLE + TAIL.

        :rtype: int
        """
        return self.__mfcc.shape[1]

    @property
    def middle_mfcc(self):
        """
        The MFCCs of the middle part of the audio file,
        that is, without HEAD and TAIL.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        return self.__mfcc[:, self.__middle_begin:self.__middle_end]

    @property
    def middle_length(self):
        """
        The length, in MFCC frames,
        of the middle part of the audio file,
        that is, without HEAD and TAIL.

        :rtype: int
        """
        return self.__middle_end - self.__middle_begin

    @property
    def middle_map(self):
        """
        Return the map
        from the MFCC frame indices
        in the MIDDLE portion of the wave
        to the MFCC FULL frame indices,
        that is, an ``numpy.arange(self.middle_begin, self.middle_end)``.

        NOTE: to translate indices of MIDDLE,
        instead of using fancy indexing with the
        result of this function, you might want to simply
        add ``self.head_length``.
        This function is provided mostly for consistency
        with the MASKED case.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        return numpy.arange(self.__middle_begin, self.__middle_end)

    @property
    def head_length(self):
        """
        The length, in MFCC frames,
        of the HEAD of the audio file.

        :rtype: int
        """
        return self.__middle_begin

    @property
    def tail_length(self):
        """
        The length, in MFCC frames,
        of the TAIL of the audio file.

        :rtype: int
        """
        return self.all_length - self.__middle_end

    @property
    def tail_begin(self):
        """
        The index, in MFCC frames,
        where the TAIL of the audio file starts.

        :rtype: int
        """
        return self.__middle_end

    @property
    def audio_length(self):
        """
        The length, in seconds, of the audio file.

        This value is the actual length of the audio file,
        computed as ``number of samples / sample_rate``,
        hence it might differ from ``len(self.__mfcc) * mfcc_window_shift``.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        return self.__audio_length

    @audio_length.setter
    def audio_length(self, audio_length):
        self.__audio_length = audio_length

    @property
    def is_reversed(self):
        """
        Return ``True`` if currently reversed.

        :rtype: bool
        """
        return self.__is_reversed

    @is_reversed.setter
    def is_reversed(self, is_reversed):
        self.__is_reversed = is_reversed

    @property
    def masked_mfcc(self):
        """
        Return the MFCC speech frames
        in the FULL wave.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        self._ensure_mfcc_mask()
        return self.__mfcc[:, self.__mfcc_mask]

    @property
    def masked_length(self):
        """
        Return the number of MFCC speech frames
        in the FULL wave.

        :rtype: int
        """
        self._ensure_mfcc_mask()
        return len(self.__mfcc_mask_map)

    @property
    def masked_map(self):
        """
        Return the map
        from the MFCC speech frame indices
        to the MFCC FULL frame indices.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        self._ensure_mfcc_mask()
        return self.__mfcc_mask_map

    @property
    def masked_middle_mfcc(self):
        """
        Return the MFCC speech frames
        in the MIDDLE portion of the wave.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        begin, end = self._masked_middle_begin_end()
        return (self.masked_mfcc)[:, begin:end]

    @property
    def masked_middle_length(self):
        """
        Return the number of MFCC speech frames
        in the MIDDLE portion of the wave.

        :rtype: int
        """
        begin, end = self._masked_middle_begin_end()
        return end - begin

    @property
    def masked_middle_map(self):
        """
        Return the map
        from the MFCC speech frame indices
        in the MIDDLE portion of the wave
        to the MFCC FULL frame indices.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        begin, end = self._masked_middle_begin_end()
        return self.__mfcc_mask_map[begin:end]

    def _masked_middle_begin_end(self):
        """
        Return the begin and end indices w.r.t. ``self.__mfcc_mask_map``,
        corresponding to indices in the MIDDLE portion of the wave,
        that is, the speech frames ``f`` such that
        ``self.__middle_begin <= f < self.__middle_end``.

        The returned pair follows the NumPy convention:
        ``begin`` is included, ``end`` is excluded.

        :rtype: (int, int)
        """
        self._ensure_mfcc_mask()
        begin = numpy.searchsorted(self.__mfcc_mask_map, self.__middle_begin, side="left")
        # middle_end is excluded from MIDDLE, hence side="left":
        # side="right" would also count a speech frame located exactly
        # at index middle_end, which is the first frame of TAIL
        end = numpy.searchsorted(self.__mfcc_mask_map, self.__middle_end, side="left")
        return (begin, end)

    def intervals(self, speech=True, time=True):
        """
        Return a list of intervals::

        [(b_1, e_1), (b_2, e_2), ..., (b_k, e_k)]

        where ``b_i`` is the time when the ``i``-th interval begins,
        and ``e_i`` is the time when it ends.

        If ``time`` is ``False``, the pairs are frame indices
        ``(first_frame, last_frame)``, both included.

        :param bool speech: if ``True``, return speech intervals,
                            otherwise return nonspeech intervals
        :param bool time: if ``True``, return values in seconds (:class:`~aeneas.timevalue.TimeValue`),
                          otherwise in indices (int)
        :rtype: list of pairs (see above)
        """
        self._ensure_mfcc_mask()
        if speech:
            self.log(u"Converting speech runs to intervals")
            intervals = self.__speech_intervals
        else:
            self.log(u"Converting nonspeech runs to intervals")
            intervals = self.__nonspeech_intervals
        if time:
            mws = self.rconf.mws
            # the stored end frame is included, hence the +1
            return [(i[0] * mws, (i[1] + 1) * mws) for i in intervals]
        return intervals

    def inside_nonspeech(self, index):
        """
        If ``index`` is contained in a nonspeech interval,
        return the pair ``(interval_begin, interval_end)``
        of that interval, as computed by :func:`run_vad`:
        both values are frame indices, and ``interval_end``
        is the last frame of the nonspeech run, so that
        ``interval_begin <= index <= interval_end``.

        Otherwise, return ``None``.

        NOTE(review): callers assuming that ``interval_end`` is excluded
        should be checked against the inclusive pairs returned here.

        :rtype: ``None`` or tuple
        """
        self._ensure_mfcc_mask()
        if (index < 0) or (index >= self.all_length) or (self.__mfcc_mask[index]):
            return None
        return self._binary_search_intervals(self.__nonspeech_intervals, index)

    @classmethod
    def _binary_search_intervals(cls, intervals, index):
        """
        Binary search for the interval containing index,
        assuming there is such an interval.

        ``intervals`` must be a sorted list of disjoint pairs
        ``(first_frame, last_frame)``, both ends included.

        If there is an interval containing ``index``,
        this function never returns ``None``.
        """
        start = 0
        end = len(intervals) - 1
        while start <= end:
            middle_index = start + ((end - start) // 2)
            middle = intervals[middle_index]
            # the end frame is included: a strict "<" would miss
            # the last frame of each run, and all single-frame runs
            if (middle[0] <= index) and (index <= middle[1]):
                return middle
            elif middle[0] > index:
                end = middle_index - 1
            else:
                start = middle_index + 1
        return None

    @property
    def middle_begin(self):
        """
        Return the index where MIDDLE starts.

        :rtype: int
        """
        return self.__middle_begin

    @middle_begin.setter
    def middle_begin(self, index):
        """
        Set the index where MIDDLE starts.

        :param int index: the new index for MIDDLE begin
        :raises: ValueError: if ``index`` is negative or greater than ``all_length``
        """
        if (index < 0) or (index > self.all_length):
            raise ValueError(u"The given index is not valid")
        self.__middle_begin = index

    @property
    def middle_begin_seconds(self):
        """
        Return the time instant, in seconds, where MIDDLE starts.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        return TimeValue(self.__middle_begin) * self.rconf.mws

    @property
    def middle_end(self):
        """
        Return the index (+1) where MIDDLE ends.

        :rtype: int
        """
        return self.__middle_end

    @middle_end.setter
    def middle_end(self, index):
        """
        Set the index (+1) where MIDDLE ends.

        :param int index: the new index for MIDDLE end
        :raises: ValueError: if ``index`` is negative or greater than ``all_length``
        """
        if (index < 0) or (index > self.all_length):
            raise ValueError(u"The given index is not valid")
        self.__middle_end = index

    @property
    def middle_end_seconds(self):
        """
        Return the time instant, in seconds, where MIDDLE ends.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        return TimeValue(self.__middle_end) * self.rconf.mws

    def _ensure_mfcc_mask(self):
        """
        Ensure that ``run_vad()`` has already been called,
        and hence ``self.__mfcc_mask`` has a meaningful value.
        """
        if self.__mfcc_mask is None:
            self.log(u"VAD was not run: running it now")
            self.run_vad()

    def _compute_mfcc_c_extension(self):
        """
        Compute MFCCs using the Python C extension cmfcc.

        :rtype: ``(True, None)`` on success, ``(False, None)`` on failure
        """
        self.log(u"Computing MFCCs using C extension...")
        try:
            self.log(u"Importing cmfcc...")
            import aeneas.cmfcc.cmfcc
            self.log(u"Importing cmfcc... done")
            # only the first element of the returned value is used,
            # transposed so that each column is a frame
            self.__mfcc = (aeneas.cmfcc.cmfcc.compute_from_data(
                self.audio_file.audio_samples,
                self.audio_file.audio_sample_rate,
                self.rconf[RuntimeConfiguration.MFCC_FILTERS],
                self.rconf[RuntimeConfiguration.MFCC_SIZE],
                self.rconf[RuntimeConfiguration.MFCC_FFT_ORDER],
                self.rconf[RuntimeConfiguration.MFCC_LOWER_FREQUENCY],
                self.rconf[RuntimeConfiguration.MFCC_UPPER_FREQUENCY],
                self.rconf[RuntimeConfiguration.MFCC_EMPHASIS_FACTOR],
                self.rconf[RuntimeConfiguration.MFCC_WINDOW_LENGTH],
                self.rconf[RuntimeConfiguration.MFCC_WINDOW_SHIFT]
            )[0]).transpose()
            self.log(u"Computing MFCCs using C extension... done")
            return (True, None)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while running cmfcc", exc, False, None)
        return (False, None)

    def _compute_mfcc_pure_python(self):
        """
        Compute MFCCs using the pure Python code.

        :rtype: ``(True, None)`` on success, ``(False, None)`` on failure
        """
        self.log(u"Computing MFCCs using pure Python code...")
        try:
            self.__mfcc = MFCC(
                rconf=self.rconf,
                logger=self.logger
            ).compute_from_data(
                self.audio_file.audio_samples,
                self.audio_file.audio_sample_rate
            ).transpose()
            self.log(u"Computing MFCCs using pure Python code... done")
            return (True, None)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while running pure Python code", exc, False, None)
        return (False, None)

    def reverse(self):
        """
        Reverse the audio file.

        The reversing is done efficiently using NumPy views inplace
        instead of swapping values.

        Only speech and nonspeech intervals are actually recomputed
        as Python lists.
        """
        self.log(u"Reversing...")
        all_length = self.all_length
        self.__mfcc = self.__mfcc[:, ::-1]
        # MIDDLE is half-open, hence it is mirrored w.r.t. all_length
        tmp = self.__middle_end
        self.__middle_end = all_length - self.__middle_begin
        self.__middle_begin = all_length - tmp
        if self.__mfcc_mask is not None:
            self.__mfcc_mask = self.__mfcc_mask[::-1]
            # equivalent to
            # self.__mfcc_mask_map = ((all_length - 1) - self.__mfcc_mask_map)[::-1]
            # but done in place using NumPy view
            self.__mfcc_mask_map *= -1
            self.__mfcc_mask_map += all_length - 1
            self.__mfcc_mask_map = self.__mfcc_mask_map[::-1]
            # intervals store (first_frame, last_frame) with both ends included,
            # hence frame f is mirrored to (all_length - 1) - f,
            # exactly like the mask map above
            last_index = all_length - 1
            self.__speech_intervals = [
                (last_index - i[1], last_index - i[0])
                for i in self.__speech_intervals[::-1]
            ]
            self.__nonspeech_intervals = [
                (last_index - i[1], last_index - i[0])
                for i in self.__nonspeech_intervals[::-1]
            ]
        self.is_reversed = not self.is_reversed
        self.log(u"Reversing...done")

    def run_vad(self):
        """
        Determine which frames contain speech and nonspeech,
        and store the resulting boolean mask internally.

        Speech and nonspeech intervals are stored as lists of pairs
        ``(first_frame, last_frame)``, both ends included.
        """
        def _compute_runs(array):
            """
            Compute runs as a list of arrays,
            each containing the indices of a contiguous run.

            :param array: the data array
            :type  array: :class:`numpy.ndarray` (1D)
            :rtype: list of :class:`numpy.ndarray` (1D)
            """
            if len(array) < 1:
                return []
            # split wherever two consecutive indices are not adjacent
            return numpy.split(array, numpy.where(numpy.diff(array) != 1)[0] + 1)
        self.log(u"Creating VAD object")
        vad = VAD(rconf=self.rconf, logger=self.logger)
        self.log(u"Running VAD...")
        # only the first row of the MFCC matrix is passed to the VAD
        self.__mfcc_mask = vad.run_vad(self.__mfcc[0])
        self.__mfcc_mask_map = (numpy.where(self.__mfcc_mask))[0]
        self.log(u"Running VAD... done")
        self.log(u"Storing speech and nonspeech intervals...")
        # where( == True) already computed, reusing
        runs = _compute_runs(self.__mfcc_mask_map)
        self.__speech_intervals = [(r[0], r[-1]) for r in runs]
        # where( == False) not already computed, computing now
        runs = _compute_runs((numpy.where(~self.__mfcc_mask))[0])
        self.__nonspeech_intervals = [(r[0], r[-1]) for r in runs]
        self.log(u"Storing speech and nonspeech intervals... done")

    def set_head_middle_tail(self, head_length=None, middle_length=None, tail_length=None):
        """
        Set the HEAD, MIDDLE, TAIL explicitly.

        If a parameter is ``None``, it will be ignored.
        If both ``middle_length`` and ``tail_length`` are specified,
        only ``middle_length`` will be applied.

        :param head_length: the length of HEAD, in seconds
        :type  head_length: :class:`~aeneas.timevalue.TimeValue`
        :param middle_length: the length of MIDDLE, in seconds
        :type  middle_length: :class:`~aeneas.timevalue.TimeValue`
        :param tail_length: the length of TAIL, in seconds
        :type  tail_length: :class:`~aeneas.timevalue.TimeValue`
        :raises: TypeError: if one of the arguments is not ``None``
                            or :class:`~aeneas.timevalue.TimeValue`
        :raises: ValueError: if one of the arguments is greater
                             than the length of the audio file
        """
        for variable, name in [
                (head_length, "head_length"),
                (middle_length, "middle_length"),
                (tail_length, "tail_length")
        ]:
            if (variable is not None) and (not isinstance(variable, TimeValue)):
                raise TypeError(u"%s is not None or TimeValue" % name)
            if (variable is not None) and (variable > self.audio_length):
                raise ValueError(u"%s is greater than the length of the audio file" % name)
        self.log(u"Setting head middle tail...")
        mws = self.rconf.mws
        self.log([u"Before: 0 %d %d %d", self.middle_begin, self.middle_end, self.all_length])
        if head_length is not None:
            self.middle_begin = int(head_length / mws)
        # middle_length is relative to the (possibly just updated) middle_begin
        # and takes precedence over tail_length
        if middle_length is not None:
            self.middle_end = self.middle_begin + int(middle_length / mws)
        elif tail_length is not None:
            self.middle_end = self.all_length - int(tail_length / mws)
        self.log([u"After:  0 %d %d %d", self.middle_begin, self.middle_end, self.all_length])
        self.log(u"Setting head middle tail... done")
class AudioFileMFCC(Loggable):
    """
    A monaural (single channel) WAVE audio file,
    represented as a NumPy 2D matrix of
    Mel-frequency cepstral coefficients (MFCC).

    The matrix is "fat", that is,
    its number of rows is equal to the number of MFCC coefficients
    and its number of columns is equal to the number of window shifts
    in the audio file.
    The number of MFCC coefficients and the MFCC window shift can
    be modified via the
    :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_SIZE`
    and
    :data:`~aeneas.runtimeconfiguration.RuntimeConfiguration.MFCC_WINDOW_SHIFT`
    keys in the ``rconf`` object.

    If ``mfcc_matrix`` is not ``None``,
    it will be used as the MFCC matrix.

    If ``file_path`` or ``audio_file`` is not ``None``,
    the MFCCs will be computed upon creation of the object,
    possibly converting to PCM16 Mono WAVE and/or
    loading audio data in memory.

    The MFCCs for the entire wave
    are divided into three
    contiguous intervals (possibly, zero-length)::

        HEAD   = [:middle_begin[
        MIDDLE = [middle_begin:middle_end[
        TAIL   = [middle_end:[

    The usual NumPy convention of including the left/start index
    and excluding the right/end index is adopted.

    For alignment purposes, only the ``MIDDLE`` portion of the wave
    is taken into account; the ``HEAD`` and ``TAIL`` intervals are ignored.

    This class heavily uses NumPy views and in-place operations
    to avoid creating temporary data or copying data around.

    :param string file_path: the path of the PCM16 mono WAVE file, or ``None``
    :param bool file_path_is_mono_wave: set to ``True`` if the audio file at ``file_path`` is a PCM16 mono WAVE file
    :param mfcc_matrix: the MFCC matrix to be set, or ``None``
    :type  mfcc_matrix: :class:`numpy.ndarray`
    :param audio_file: an audio file, or ``None``
    :type  audio_file: :class:`~aeneas.audiofile.AudioFile`
    :param rconf: a runtime configuration
    :type  rconf: :class:`~aeneas.runtimeconfiguration.RuntimeConfiguration`
    :param logger: the logger object
    :type  logger: :class:`~aeneas.logger.Logger`
    :raises: ValueError: if ``file_path``, ``audio_file``, and ``mfcc_matrix`` are all ``None``

    .. versionadded:: 1.5.0
    """

    TAG = u"AudioFileMFCC"

    def __init__(
            self,
            file_path=None,
            file_path_is_mono_wave=False,
            mfcc_matrix=None,
            audio_file=None,
            rconf=None,
            logger=None
    ):
        if (file_path is None) and (audio_file is None) and (mfcc_matrix is None):
            raise ValueError(u"You must initialize with at least one of: file_path, audio_file, or mfcc_matrix")
        super(AudioFileMFCC, self).__init__(rconf=rconf, logger=logger)
        self.file_path = file_path
        self.audio_file = audio_file
        self.is_reversed = False
        self.__mfcc = None
        self.__mfcc_mask = None
        self.__mfcc_mask_map = None
        # NOTE both interval lists hold half-open index pairs (begin, end),
        #      that is, ``end`` is the index of the first frame AFTER the run;
        #      reverse() and _binary_search_intervals() rely on this convention
        self.__speech_intervals = None
        self.__nonspeech_intervals = None
        self.log(u"Initializing MFCCs...")
        if mfcc_matrix is not None:
            self.__mfcc = mfcc_matrix
            self.audio_length = self.all_length * self.rconf.mws
        elif (self.file_path is not None) or (self.audio_file is not None):
            audio_file_was_none = False
            if self.audio_file is None:
                audio_file_was_none = True
                self.audio_file = AudioFile(
                    self.file_path,
                    is_mono_wave=file_path_is_mono_wave,
                    rconf=self.rconf,
                    logger=self.logger
                )
                # NOTE load audio samples into memory, if not present already
                self.audio_file.audio_samples
            gf.run_c_extension_with_fallback(
                self.log,
                "cmfcc",
                self._compute_mfcc_c_extension,
                self._compute_mfcc_pure_python,
                (),
                rconf=self.rconf
            )
            self.audio_length = self.audio_file.audio_length
            if audio_file_was_none:
                # the AudioFile object was created here, so release it here
                self.log(u"Clearing the audio data...")
                self.audio_file.clear_data()
                self.audio_file = None
                self.log(u"Clearing the audio data... done")
        # by default, MIDDLE spans the whole wave
        self.__middle_begin = 0
        self.__middle_end = self.__mfcc.shape[1]
        self.log(u"Initializing MFCCs... done")

    def __unicode__(self):
        msg = [
            u"File path:        %s" % self.file_path,
            u"Audio length (s): %s" % gf.safe_float(self.audio_length),
        ]
        return u"\n".join(msg)

    def __str__(self):
        return gf.safe_str(self.__unicode__())

    @property
    def all_mfcc(self):
        """
        The MFCCs of the entire audio file,
        that is, HEAD + MIDDLE + TAIL.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        return self.__mfcc

    @property
    def all_length(self):
        """
        The length, in MFCC coefficients,
        of the entire audio file,
        that is, HEAD + MIDDLE + TAIL.

        :rtype: int
        """
        return self.__mfcc.shape[1]

    @property
    def middle_mfcc(self):
        """
        The MFCCs of the middle part of the audio file,
        that is, without HEAD and TAIL.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        return self.__mfcc[:, self.__middle_begin:self.__middle_end]

    @property
    def middle_length(self):
        """
        The length, in MFCC coefficients,
        of the middle part of the audio file,
        that is, without HEAD and TAIL.

        :rtype: int
        """
        return self.__middle_end - self.__middle_begin

    @property
    def middle_map(self):
        """
        Return the map
        from the MFCC frame indices
        in the MIDDLE portion of the wave
        to the MFCC FULL frame indices,
        that is, an ``numpy.arange(self.middle_begin, self.middle_end)``.

        NOTE: to translate indices of MIDDLE,
        instead of using fancy indexing with the
        result of this function, you might want to simply
        add ``self.head_length``.
        This function is provided mostly for consistency
        with the MASKED case.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        return numpy.arange(self.__middle_begin, self.__middle_end)

    @property
    def head_length(self):
        """
        The length, in MFCC coefficients,
        of the HEAD of the audio file.

        :rtype: int
        """
        return self.__middle_begin

    @property
    def tail_length(self):
        """
        The length, in MFCC coefficients,
        of the TAIL of the audio file.

        :rtype: int
        """
        return self.all_length - self.__middle_end

    @property
    def tail_begin(self):
        """
        The index, in MFCC coefficients,
        where the TAIL of the audio file starts.

        :rtype: int
        """
        return self.__middle_end

    @property
    def audio_length(self):
        """
        The length, in seconds, of the audio file.

        This value is the actual length of the audio file,
        computed as ``number of samples / sample_rate``,
        hence it might differ from ``len(self.__mfcc) * mfcc_window_shift``.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        return self.__audio_length

    @audio_length.setter
    def audio_length(self, audio_length):
        self.__audio_length = audio_length

    @property
    def is_reversed(self):
        """
        Return ``True`` if currently reversed.

        :rtype: bool
        """
        return self.__is_reversed

    @is_reversed.setter
    def is_reversed(self, is_reversed):
        self.__is_reversed = is_reversed

    @property
    def masked_mfcc(self):
        """
        Return the MFCC speech frames
        in the FULL wave.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        self._ensure_mfcc_mask()
        return self.__mfcc[:, self.__mfcc_mask]

    @property
    def masked_length(self):
        """
        Return the number of MFCC speech frames
        in the FULL wave.

        :rtype: int
        """
        self._ensure_mfcc_mask()
        return len(self.__mfcc_mask_map)

    @property
    def masked_map(self):
        """
        Return the map
        from the MFCC speech frame indices
        to the MFCC FULL frame indices.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        self._ensure_mfcc_mask()
        return self.__mfcc_mask_map

    @property
    def masked_middle_mfcc(self):
        """
        Return the MFCC speech frames
        in the MIDDLE portion of the wave.

        :rtype: :class:`numpy.ndarray` (2D)
        """
        begin, end = self._masked_middle_begin_end()
        return (self.masked_mfcc)[:, begin:end]

    @property
    def masked_middle_length(self):
        """
        Return the number of MFCC speech frames
        in the MIDDLE portion of the wave.

        :rtype: int
        """
        begin, end = self._masked_middle_begin_end()
        return end - begin

    @property
    def masked_middle_map(self):
        """
        Return the map
        from the MFCC speech frame indices
        in the MIDDLE portion of the wave
        to the MFCC FULL frame indices.

        :rtype: :class:`numpy.ndarray` (1D)
        """
        begin, end = self._masked_middle_begin_end()
        return self.__mfcc_mask_map[begin:end]

    def _masked_middle_begin_end(self):
        """
        Return the begin and end indices w.r.t. ``self.__mfcc_mask_map``,
        corresponding to indices in the MIDDLE portion of the wave,
        that is, the speech frames whose FULL index ``i`` satisfies
        ``self.__middle_begin <= i < self.__middle_end``.

        The returned pair is half-open, like MIDDLE itself.

        :rtype: (int, int)
        """
        self._ensure_mfcc_mask()
        # first position whose frame index is >= middle_begin
        begin = numpy.searchsorted(self.__mfcc_mask_map, self.__middle_begin, side="left")
        # first position whose frame index is >= middle_end:
        # middle_end is excluded from MIDDLE, so a speech frame sitting
        # exactly at middle_end belongs to TAIL and must not be counted
        # (side="right" would wrongly include it)
        end = numpy.searchsorted(self.__mfcc_mask_map, self.__middle_end, side="left")
        return (begin, end)

    def intervals(self, speech=True, time=True):
        """
        Return a list of intervals::

        [(b_1, e_1), (b_2, e_2), ..., (b_k, e_k)]

        where ``b_i`` is the time when the ``i``-th interval begins,
        and ``e_i`` is the time when it ends.

        If ``time`` is ``False``, ``b_i`` and ``e_i`` are the indices
        of the first and of the last frame of the ``i``-th interval,
        that is, ``e_i`` is included in the interval.

        :param bool speech: if ``True``, return speech intervals,
                            otherwise return nonspeech intervals
        :param bool time: if ``True``, return values in seconds (:class:`~aeneas.timevalue.TimeValue`),
                          otherwise in indices (int)
        :rtype: list of pairs (see above)
        """
        self._ensure_mfcc_mask()
        if speech:
            self.log(u"Converting speech runs to intervals")
            intervals = self.__speech_intervals
        else:
            self.log(u"Converting nonspeech runs to intervals")
            intervals = self.__nonspeech_intervals
        if time:
            mws = self.rconf.mws
            # the stored end is already "one past the last frame"
            return [(begin * mws, end * mws) for begin, end in intervals]
        # stored pairs are half-open: expose the index of the last frame
        return [(begin, end - 1) for begin, end in intervals]

    def inside_nonspeech(self, index):
        """
        If ``index`` is contained in a nonspeech interval,
        return a pair ``(interval_begin, interval_end)``
        such that ``interval_begin <= index < interval_end``,
        i.e., ``interval_end`` is assumed not to be included.

        Otherwise, return ``None``.

        :param int index: the index of the frame, w.r.t. the FULL wave
        :rtype: ``None`` or tuple
        """
        self._ensure_mfcc_mask()
        if (index < 0) or (index >= self.all_length) or (self.__mfcc_mask[index]):
            return None
        return self._binary_search_intervals(self.__nonspeech_intervals, index)

    @classmethod
    def _binary_search_intervals(cls, intervals, index):
        """
        Binary search for the interval containing index,
        assuming there is such an interval.

        The given intervals must be sorted, disjoint,
        half-open pairs ``(begin, end)``, with ``end`` not included.

        This function should never return ``None``.

        :param list intervals: the list of half-open intervals to search
        :param int index: the index to look for
        :rtype: tuple
        """
        start = 0
        end = len(intervals) - 1
        while start <= end:
            middle_index = start + ((end - start) // 2)
            middle = intervals[middle_index]
            if (middle[0] <= index) and (index < middle[1]):
                return middle
            elif middle[0] > index:
                end = middle_index - 1
            else:
                start = middle_index + 1
        return None

    @property
    def middle_begin(self):
        """
        Return the index where MIDDLE starts.

        :rtype: int
        """
        return self.__middle_begin

    @middle_begin.setter
    def middle_begin(self, index):
        """
        Set the index where MIDDLE starts.

        :param int index: the new index for MIDDLE begin
        :raises: ValueError: if ``index`` is negative or greater than ``all_length``
        """
        if (index < 0) or (index > self.all_length):
            raise ValueError(u"The given index is not valid")
        self.__middle_begin = index

    @property
    def middle_begin_seconds(self):
        """
        Return the time instant, in seconds, where MIDDLE starts.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        return TimeValue(self.__middle_begin) * self.rconf.mws

    @property
    def middle_end(self):
        """
        Return the index (+1) where MIDDLE ends.

        :rtype: int
        """
        return self.__middle_end

    @middle_end.setter
    def middle_end(self, index):
        """
        Set the index (+1) where MIDDLE ends.

        :param int index: the new index for MIDDLE end
        :raises: ValueError: if ``index`` is negative or greater than ``all_length``
        """
        if (index < 0) or (index > self.all_length):
            raise ValueError(u"The given index is not valid")
        self.__middle_end = index

    @property
    def middle_end_seconds(self):
        """
        Return the time instant, in seconds, where MIDDLE ends.

        :rtype: :class:`~aeneas.timevalue.TimeValue`
        """
        return TimeValue(self.__middle_end) * self.rconf.mws

    def _ensure_mfcc_mask(self):
        """
        Ensure that ``run_vad()`` has already been called,
        and hence ``self.__mfcc_mask`` has a meaningful value.
        """
        if self.__mfcc_mask is None:
            self.log(u"VAD was not run: running it now")
            self.run_vad()

    def _compute_mfcc_c_extension(self):
        """
        Compute MFCCs using the Python C extension cmfcc.

        Return a pair whose first element is ``True`` on success
        and ``False`` if an exception was raised.

        :rtype: (bool, None)
        """
        self.log(u"Computing MFCCs using C extension...")
        try:
            self.log(u"Importing cmfcc...")
            import aeneas.cmfcc.cmfcc
            self.log(u"Importing cmfcc... done")
            # the first element of the returned value is transposed
            # and stored as the MFCC matrix
            self.__mfcc = (aeneas.cmfcc.cmfcc.compute_from_data(
                self.audio_file.audio_samples,
                self.audio_file.audio_sample_rate,
                self.rconf[RuntimeConfiguration.MFCC_FILTERS],
                self.rconf[RuntimeConfiguration.MFCC_SIZE],
                self.rconf[RuntimeConfiguration.MFCC_FFT_ORDER],
                self.rconf[RuntimeConfiguration.MFCC_LOWER_FREQUENCY],
                self.rconf[RuntimeConfiguration.MFCC_UPPER_FREQUENCY],
                self.rconf[RuntimeConfiguration.MFCC_EMPHASIS_FACTOR],
                self.rconf[RuntimeConfiguration.MFCC_WINDOW_LENGTH],
                self.rconf[RuntimeConfiguration.MFCC_WINDOW_SHIFT]
            )[0]).transpose()
            self.log(u"Computing MFCCs using C extension... done")
            return (True, None)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while running cmfcc", exc, False, None)
        return (False, None)

    def _compute_mfcc_pure_python(self):
        """
        Compute MFCCs using the pure Python code.

        Return a pair whose first element is ``True`` on success
        and ``False`` if an exception was raised.

        :rtype: (bool, None)
        """
        self.log(u"Computing MFCCs using pure Python code...")
        try:
            self.__mfcc = MFCC(
                rconf=self.rconf,
                logger=self.logger
            ).compute_from_data(
                self.audio_file.audio_samples,
                self.audio_file.audio_sample_rate
            ).transpose()
            self.log(u"Computing MFCCs using pure Python code... done")
            return (True, None)
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while running pure Python code", exc, False, None)
        return (False, None)

    def reverse(self):
        """
        Reverse the audio file.

        The reversing is done efficiently using NumPy views inplace
        instead of swapping values.

        Only speech and nonspeech intervals are actually recomputed
        as Python lists.
        """
        self.log(u"Reversing...")
        all_length = self.all_length
        self.__mfcc = self.__mfcc[:, ::-1]
        # MIDDLE is half-open, so [b, e[ becomes [all_length - e, all_length - b[
        tmp = self.__middle_end
        self.__middle_end = all_length - self.__middle_begin
        self.__middle_begin = all_length - tmp
        if self.__mfcc_mask is not None:
            self.__mfcc_mask = self.__mfcc_mask[::-1]
            # equivalent to
            # self.__mfcc_mask_map = ((all_length - 1) - self.__mfcc_mask_map)[::-1]
            # but done in place using NumPy view
            self.__mfcc_mask_map *= -1
            self.__mfcc_mask_map += all_length - 1
            self.__mfcc_mask_map = self.__mfcc_mask_map[::-1]
            # the stored intervals are half-open as well,
            # hence the same transformation applied to MIDDLE is correct
            self.__speech_intervals = [(all_length - i[1], all_length - i[0]) for i in self.__speech_intervals[::-1]]
            self.__nonspeech_intervals = [(all_length - i[1], all_length - i[0]) for i in self.__nonspeech_intervals[::-1]]
        self.is_reversed = not self.is_reversed
        self.log(u"Reversing...done")

    def run_vad(self):
        """
        Determine which frames contain speech and nonspeech,
        and store the resulting boolean mask internally.
        """
        def _compute_runs(array):
            """
            Compute runs as a list of arrays,
            each containing the indices of a contiguous run.

            :param array: the data array
            :type  array: :class:`numpy.ndarray` (1D)
            :rtype: list of :class:`numpy.ndarray` (1D)
            """
            if len(array) < 1:
                return []
            # split wherever two consecutive indices are not adjacent
            return numpy.split(array, numpy.where(numpy.diff(array) != 1)[0] + 1)

        self.log(u"Creating VAD object")
        vad = VAD(rconf=self.rconf, logger=self.logger)
        self.log(u"Running VAD...")
        # the VAD is run on the first row of the MFCC matrix only
        self.__mfcc_mask = vad.run_vad(self.__mfcc[0])
        self.__mfcc_mask_map = (numpy.where(self.__mfcc_mask))[0]
        self.log(u"Running VAD... done")
        self.log(u"Storing speech and nonspeech intervals...")
        # where( == True) already computed, reusing
        runs = _compute_runs(self.__mfcc_mask_map)
        # store half-open pairs: the end is one past the last frame of the run
        self.__speech_intervals = [(r[0], r[-1] + 1) for r in runs]
        # where( == False) not already computed, computing now
        runs = _compute_runs((numpy.where(~self.__mfcc_mask))[0])
        self.__nonspeech_intervals = [(r[0], r[-1] + 1) for r in runs]
        self.log(u"Storing speech and nonspeech intervals... done")

    def set_head_middle_tail(self, head_length=None, middle_length=None, tail_length=None):
        """
        Set the HEAD, MIDDLE, TAIL explicitly.

        If a parameter is ``None``, it will be ignored.
        If both ``middle_length`` and ``tail_length`` are specified,
        only ``middle_length`` will be applied.

        :param head_length: the length of HEAD, in seconds
        :type  head_length: :class:`~aeneas.timevalue.TimeValue`
        :param middle_length: the length of MIDDLE, in seconds
        :type  middle_length: :class:`~aeneas.timevalue.TimeValue`
        :param tail_length: the length of TAIL, in seconds
        :type  tail_length: :class:`~aeneas.timevalue.TimeValue`
        :raises: TypeError: if one of the arguments is not ``None``
                            or :class:`~aeneas.timevalue.TimeValue`
        :raises: ValueError: if a resulting index is negative
                             or greater than ``all_length``
        """
        for variable, name in [
                (head_length, "head_length"),
                (middle_length, "middle_length"),
                (tail_length, "tail_length")
        ]:
            if (variable is not None) and (not isinstance(variable, TimeValue)):
                raise TypeError(u"%s is not None or TimeValue" % name)
        self.log(u"Setting head middle tail...")
        mws = self.rconf.mws
        self.log([u"Before: 0 %d %d %d", self.middle_begin, self.middle_end, self.all_length])
        # the assignments below go through the property setters,
        # which validate the resulting indices
        if head_length is not None:
            self.middle_begin = int(head_length / mws)
        if middle_length is not None:
            self.middle_end = self.middle_begin + int(middle_length / mws)
        elif tail_length is not None:
            self.middle_end = self.all_length - int(tail_length / mws)
        self.log([u"After:  0 %d %d %d", self.middle_begin, self.middle_end, self.all_length])
        self.log(u"Setting head middle tail... done")
def _cut_head_tail(self, audio_file_path):
    """
    Trim the head and/or tail of the audio file at ``audio_file_path``.

    The head/process lengths are either read from the task configuration
    (explicit case) or detected via ``SD`` when at least one
    ``detect_*`` parameter is set; in the latter case the detected
    values are also written back into the task configuration.
    The trimmed audio is written over the file at ``audio_file_path``.

    Any exception is caught, logged, and reported through the return value.

    :param audio_file_path: the path of the audio file to be trimmed
    :rtype: bool (``True`` on success, ``False`` if an exception occurred)
    """
    self._log("Setting head and/or tail")
    try:
        config = self.task.configuration
        head_length = config.is_audio_file_head_length
        process_length = config.is_audio_file_process_length
        head_min = config.is_audio_file_detect_head_min
        head_max = config.is_audio_file_detect_head_max
        tail_min = config.is_audio_file_detect_tail_min
        tail_max = config.is_audio_file_detect_tail_max

        # explicit head or process?
        has_explicit = not ((head_length is None) and (process_length is None))
        # at least one detect parameter, for the head and for the tail?
        wants_head = not ((head_min is None) and (head_max is None))
        wants_tail = not ((tail_min is None) and (tail_max is None))

        if not (has_explicit or wants_head or wants_tail):
            # nothing to do
            self._log("No explicit head/process or detect head/tail")
            self._log("Setting head and/or tail: succeeded")
            return True

        # from here on, the audio data is needed
        audio_file = AudioFile(audio_file_path, logger=self.logger)
        audio_file.load_data()

        if has_explicit:
            self._log("Explicit head or process")
        else:
            self._log("No explicit head or process => detecting head/tail")

            head = 0.0
            if wants_head:
                self._log("Detecting head...")
                head_min = gf.safe_float(head_min, gc.SD_MIN_HEAD_LENGTH)
                head_max = gf.safe_float(head_max, gc.SD_MAX_HEAD_LENGTH)
                self._log(["detect_head_min is %.3f", head_min])
                self._log(["detect_head_max is %.3f", head_max])
                detector = SD(audio_file, self.task.text_file, logger=self.logger)
                head = detector.detect_head(head_min, head_max)
                self._log(["Detected head: %.3f", head])

            tail = 0.0
            if wants_tail:
                self._log("Detecting tail...")
                tail_max = gf.safe_float(tail_max, gc.SD_MAX_TAIL_LENGTH)
                tail_min = gf.safe_float(tail_min, gc.SD_MIN_TAIL_LENGTH)
                self._log(["detect_tail_min is %.3f", tail_min])
                self._log(["detect_tail_max is %.3f", tail_max])
                detector = SD(audio_file, self.task.text_file, logger=self.logger)
                tail = detector.detect_tail(tail_min, tail_max)
                self._log(["Detected tail: %.3f", tail])

            # sanity check: never go below zero
            head_length = max(0, head)
            process_length = max(0, audio_file.audio_length - tail - head)

            # store the detected values in the config object for later use
            self.task.configuration.is_audio_file_head_length = head_length
            self.task.configuration.is_audio_file_process_length = process_length
            self._log(["Set head_length:    %.3f", head_length])
            self._log(["Set process_length: %.3f", process_length])

        # values read from the config object might not be floats yet
        head_length = None if head_length is None else float(head_length)
        process_length = None if process_length is None else float(process_length)

        # note that str() is necessary, as one might be None
        self._log(["is_audio_file_head_length is %s", str(head_length)])
        self._log(["is_audio_file_process_length is %s", str(process_length)])

        self._log("Trimming audio data...")
        audio_file.trim(head_length, process_length)
        self._log("Trimming audio data... done")

        self._log("Writing audio file...")
        audio_file.write(audio_file_path)
        self._log("Writing audio file... done")
        audio_file.clear_data()

        self._log("Setting head and/or tail: succeeded")
        return True
    except Exception as e:
        self._log("Setting head and/or tail: failed")
        self._log(["Message: %s", str(e)])
        return False
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect the time at which the text starts in the audio file.

    A query is synthesized from ``self.text_file`` and its MFCCs
    are matched, via DTW, against windows of the audio MFCCs
    starting at each speech interval of ``self.audio_speech``
    whose begin time lies in ``[min_start_length, max_start_length]``.
    One candidate is recorded per evaluated interval, and
    the best one is chosen by ``self._select_best_candidate``.

    The temporary file holding the synthesized query is always removed,
    even if the synthesis or the MFCC extraction fails.

    :param min_start_length: minimum admissible start time, in seconds
    :param max_start_length: maximum admissible start time, in seconds
    :param metric: the metric used to compare candidates
                   (compared against ``SDMetric.VALUE`` and ``SDMetric.DISTORTION``)
    :param bool backwards: passed to the synthesizer; if ``True``,
                           the synthesized query is also reversed
    :rtype: float (the start time of the best candidate,
            or ``0.0`` if there is no best candidate)
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric:           %s", metric])
    self._log(["Backwards:        %s", str(backwards)])

    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate:     %.3f", audio_rate])

    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(
        suffix=".wav",
        dir=gf.custom_tmp_dir()
    )
    # mkstemp leaves the removal of the file to the caller:
    # clean up in a finally clause so that a failure while synthesizing
    # or extracting the MFCCs does not leak the temporary file
    try:
        synt = Synthesizer(logger=self.logger)
        synt_duration = max_start_length * self.QUERY_FACTOR
        self._log(["Synthesizing %.3f seconds", synt_duration])
        result = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=backwards
        )
        self._log("Synthesizing query... done")

        query_file = AudioFile(tmp_file_path)
        if backwards:
            self._log("Reversing query")
            query_file.reverse()
        self._log("Extracting MFCCs for query...")
        query_file.extract_mfcc(frame_rate=self.frame_rate)
        query_file.clear_data()
        self._log("Extracting MFCCs for query... done")
    finally:
        self._log("Cleaning up...")
        self._cleanup(tmp_handler, tmp_file_path)
        self._log("Cleaning up... done")

    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len

    # never shrink the query: the factor is at least 1
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate:     %.3f", audio_rate])
    self._log(["Query rate:     %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])

    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])

    # number of frames of the (limited) audio and of the query
    audio_frames = audio_mfcc.shape[1]
    query_frames = query_mfcc.shape[1]

    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(query_frames * stretch_factor)

    self._log(["Audio has %d frames == %.3f seconds", audio_frames, self._i2t(audio_frames)])
    self._log(["Query has %d frames == %.3f seconds", query_frames, self._i2t(query_frames)])
    self._log(["Stretch factor:          %.3f", stretch_factor])
    self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log(["  %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])

    admissible_intervals = [
        x for x in self.audio_speech
        if min_start_length <= x[0] <= max_start_length
    ]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log(["  %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])

    candidates = []
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf

    for interval in admissible_intervals:
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log("  Breaking: too many runs without improvement")
            break

        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log("  Breaking: too many runs with minimum required length")
            break

        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log(["Evaluating interval starting at %d == %.3f ", start_index, start_time])
        if start_index > audio_frames:
            self._log("  Breaking: start index outside audio window")
            break

        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > audio_frames:
            self._log("  Breaking: not enough audio left in shifted window")
            break
        # the window examined is at most twice as long as the query
        end_index = min(start_index + 2 * query_frames, audio_frames)
        end_time = self._i2t(end_index)

        self._log(["  Start   %d == %.3f", start_index, start_time])
        self._log(["  Req end %d == %.3f", req_end_index, req_end_time])
        self._log(["  Eff end %d == %.3f", end_index, end_time])

        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        window_frames = audio_mfcc_sub.shape[1]

        self._log("Computing DTW...")
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(window_frames)
        aligner.synt_wave_length = self._i2t(query_frames)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        last_row = acm[-1, :]
        self._log("Computing DTW... done")

        # numpy.argmin raises ValueError on an empty array:
        # if no column satisfies the minimum length requirement,
        # this window cannot yield a candidate; since windows never
        # grow as the start index increases, stop here
        if stretched_match_minimum_length >= len(last_row):
            self._log("  Breaking: window shorter than required minimum length")
            break

        # find the minimum, but its index must be >= stretched_match_minimum_length
        candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        candidate_distortion = candidate_value / candidate_length_index

        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0

        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1

        # append to the list of candidates
        self._log(["    Interval  start:      %d == %.6f", start_index, start_time])
        self._log(["    Interval  end:        %d == %.6f", end_index, end_time])
        self._log(["    Candidate start:      %d == %.6f", start_index, start_time])
        self._log(["    Candidate end:        %d == %.6f", candidate_end_index, candidate_end_time])
        self._log(["    Candidate length:     %d == %.6f", candidate_length_index, candidate_length_time])
        self._log(["    Candidate value:      %.6f", candidate_value])
        self._log(["    Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })

    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time