def _align_waves(self, real_path, synt_path):
    """
    Run the DTW aligner on two ``wav`` files.

    Return a pair ``(success, alignment_map)``:

    1. ``success`` is a bool flag;
    2. ``alignment_map`` is the computed alignment map, that is,
       a list of pairs of floats ``[real_time, synt_time]``,
       each giving corresponding time instants in the real
       and in the synthesized wave.

    If any step raises, the error message is logged and
    ``(False, None)`` is returned instead of propagating the exception.
    """
    log = self._log
    log("Aligning waves")
    try:
        log("Creating DTWAligner object")
        dtw = DTWAligner(real_path, synt_path, logger=self.logger)

        log("Computing MFCC...")
        dtw.compute_mfcc()
        log("Computing MFCC... done")

        log("Computing path...")
        dtw.compute_path()
        log("Computing path... done")

        log("Computing map...")
        alignment_map = dtw.computed_map
        log("Computing map... done")

        log("Aligning waves: succeeded")
        return True, alignment_map
    except Exception as exc:
        # best effort: report the failure through the return value
        log("Aligning waves: failed")
        log(["Message: %s", str(exc)])
        return False, None
def test_set_synt_wave_mfcc(self):
    # Only the synthesized wave is supplied, as an MFCC object:
    # the synt MFCC must be set, everything else must stay None.
    synt_mfcc = AudioFileMFCC(self.AUDIO_FILE)
    dtw = DTWAligner(synt_wave_mfcc=synt_mfcc)
    self.assertIsNone(dtw.real_wave_mfcc)
    self.assertIsNotNone(dtw.synt_wave_mfcc)
    self.assertIsNone(dtw.real_wave_path)
    self.assertIsNone(dtw.synt_wave_path)
def _align_waves(self, real_wave_mfcc, synt_wave_mfcc, synt_anchors):
    """
    Run the DTW aligner on two AudioFileMFCC objects,
    representing the real and the synthesized WAVE files.

    Return the list of boundary indices that the aligner
    computes for ``synt_anchors``.
    """
    self.log(u"Creating DTWAligner...")
    dtw = DTWAligner(
        real_wave_mfcc,
        synt_wave_mfcc,
        rconf=self.rconf,
        logger=self.logger,
    )
    self.log(u"Creating DTWAligner... done")

    self.log(u"Computing boundary indices...")
    indices = dtw.compute_boundaries(synt_anchors)
    self.log(u"Computing boundary indices... done")
    return indices
def test_compute_acm_real_mfcc(self):
    # With only the real wave MFCC set, computing the accumulated
    # cost matrix must raise DTWAlignerNotInitialized.
    real_mfcc = AudioFileMFCC(self.AUDIO_FILE)
    dtw = DTWAligner(real_wave_mfcc=real_mfcc)
    self.assertRaises(
        DTWAlignerNotInitialized,
        dtw.compute_accumulated_cost_matrix,
    )
def test_compute_acm_synt_path(self):
    # With only the synthesized wave path set, computing the accumulated
    # cost matrix must raise DTWAlignerNotInitialized.
    dtw = DTWAligner(synt_wave_path=self.AUDIO_FILE)
    self.assertRaises(
        DTWAlignerNotInitialized,
        dtw.compute_accumulated_cost_matrix,
    )
def test_compute_acm_none(self):
    # An aligner created without any wave must refuse to compute
    # the accumulated cost matrix.
    dtw = DTWAligner()
    self.assertRaises(
        DTWAlignerNotInitialized,
        dtw.compute_accumulated_cost_matrix,
    )
def test_set_synt_wave_path(self):
    # Only the synthesized wave is supplied, as a file path:
    # both the synt path and the synt MFCC must be set,
    # while the real wave attributes must stay None.
    dtw = DTWAligner(synt_wave_path=self.AUDIO_FILE)
    self.assertIsNone(dtw.real_wave_mfcc)
    self.assertIsNotNone(dtw.synt_wave_path)
    self.assertIsNone(dtw.real_wave_path)
    self.assertIsNotNone(dtw.synt_wave_mfcc)
def test_create_aligner(self):
    # An aligner created without arguments has no wave set at all.
    dtw = DTWAligner()
    unset_attributes = (
        "real_wave_mfcc",
        "synt_wave_mfcc",
        "real_wave_path",
        "synt_wave_path",
    )
    for attribute in unset_attributes:
        self.assertIsNone(getattr(dtw, attribute))
def test_compute_path_synt_mfcc(self):
    # With only the synthesized wave MFCC set, computing the path
    # must raise DTWAlignerNotInitialized.
    synt_mfcc = AudioFileMFCC(self.AUDIO_FILE)
    dtw = DTWAligner(synt_wave_mfcc=synt_mfcc)
    self.assertRaises(DTWAlignerNotInitialized, dtw.compute_path)
def test_compute_path_synt_path(self):
    # With only the synthesized wave path set, computing the path
    # must raise DTWAlignerNotInitialized.
    dtw = DTWAligner(synt_wave_path=self.AUDIO_FILE)
    self.assertRaises(DTWAlignerNotInitialized, dtw.compute_path)
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect the time at which the text starts in the audio.

    A query of about ``max_start_length * self.QUERY_FACTOR`` seconds
    is synthesized from ``self.text_file``. Its MFCCs are then matched,
    via the DTW accumulated cost matrix, against a window of the real
    audio MFCCs starting at each admissible speech interval, that is,
    each interval of ``self.audio_speech`` whose begin time lies in
    ``[min_start_length, max_start_length]``.
    One candidate is recorded per evaluated interval, and the best one
    is chosen by ``self._select_best_candidate``.

    The temporary file holding the synthesized query is always removed,
    even if synthesizing or extracting the MFCCs fails.

    :param min_start_length: minimum begin time, in seconds,
                             of an admissible speech interval
    :param max_start_length: maximum begin time, in seconds,
                             of an admissible speech interval
    :param metric: the ``SDMetric`` used to track improvements
                   and to select the best candidate
    :param backwards: passed to the synthesizer; if true,
                      the synthesized query is also reversed
    :returns: the start time of the best candidate, in seconds,
              or ``0.0`` if there is no best candidate
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric: %s", metric])
    self._log(["Backwards: %s", str(backwards)])

    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate: %.3f", audio_rate])

    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav", dir=gf.custom_tmp_dir())
    try:
        synt = Synthesizer(logger=self.logger)
        synt_duration = max_start_length * self.QUERY_FACTOR
        self._log(["Synthesizing %.3f seconds", synt_duration])
        result = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=backwards
        )
        self._log("Synthesizing query... done")

        query_file = AudioFile(tmp_file_path)
        if backwards:
            self._log("Reversing query")
            query_file.reverse()
        self._log("Extracting MFCCs for query...")
        query_file.extract_mfcc(frame_rate=self.frame_rate)
        query_file.clear_data()
        self._log("Extracting MFCCs for query... done")
    finally:
        # mkstemp leaves deletion to the caller: clean up on failure, too
        self._log("Cleaning up...")
        self._cleanup(tmp_handler, tmp_file_path)
        self._log("Cleaning up... done")

    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len

    # never shrink the query: the stretch factor is at least 1
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate: %.3f", audio_rate])
    self._log(["Query rate: %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])

    # restrict the real audio to its first frames
    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])

    # o = frames in the (limited) real audio, n = frames in the query
    _, o = audio_mfcc.shape
    _, n = query_mfcc.shape

    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(n * stretch_factor)

    self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
    self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
    self._log(["Stretch factor: %.3f", stretch_factor])
    self._log(["Required minimum length: %.3f", stretched_match_minimum_length])

    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log([
            " %d %d == %.3f %.3f",
            self._t2i(interval[0]),
            self._t2i(interval[1]),
            interval[0],
            interval[1]
        ])

    admissible_intervals = [
        x for x in self.audio_speech
        if min_start_length <= x[0] <= max_start_length
    ]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log([
            " %d %d == %.3f %.3f",
            self._t2i(interval[0]),
            self._t2i(interval[1]),
            interval[0],
            interval[1]
        ])

    candidates = []
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf

    for interval in admissible_intervals:
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log(" Breaking: too many runs without improvement")
            break

        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log(" Breaking: too many runs with minimum required length")
            break

        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log([
            "Evaluating interval starting at %d == %.3f ",
            start_index,
            start_time
        ])
        if start_index > o:
            self._log(" Breaking: start index outside audio window")
            break

        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > o:
            self._log(" Breaking: not enough audio left in shifted window")
            break

        # the window is at most twice as long as the query
        end_index = min(start_index + 2 * n, o)
        end_time = self._i2t(end_index)

        self._log([" Start %d == %.3f", start_index, start_time])
        self._log([" Req end %d == %.3f", req_end_index, req_end_time])
        self._log([" Eff end %d == %.3f", end_index, end_time])

        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        _, m = audio_mfcc_sub.shape

        self._log("Computing DTW...")
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(m)
        aligner.synt_wave_length = self._i2t(n)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        last_row = acm[-1, :]
        self._log("Computing DTW... done")

        # numpy.argmin raises ValueError on an empty sequence:
        # skip windows with no position at or beyond the required minimum
        if stretched_match_minimum_length >= len(last_row):
            self._log(" Skipping: no end position at or beyond the required minimum length")
            continue

        # find the minimum, but its index must be >= stretched_match_minimum_length
        candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        candidate_distortion = candidate_value / candidate_length_index

        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0

        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1

        # append to the list of candidates
        self._log([" Interval start: %d == %.6f", start_index, start_time])
        self._log([" Interval end: %d == %.6f", end_index, end_time])
        self._log([" Candidate start: %d == %.6f", start_index, start_time])
        self._log([" Candidate end: %d == %.6f", candidate_end_index, candidate_end_time])
        self._log([" Candidate length: %d == %.6f", candidate_length_index, candidate_length_time])
        self._log([" Candidate value: %.6f", candidate_value])
        self._log([" Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })

    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time
def _detect(self, min_length, max_length, tail=False):
    """
    Detect the head or tail within ``min_length`` and ``max_length`` duration.

    If detecting the tail, the real wave MFCC and the query are reversed
    so that the tail detection problem reduces to a head detection problem.
    The real wave MFCC is reversed back before returning.

    A query is synthesized into a temporary file, which is always deleted,
    even if synthesizing or extracting the query MFCCs fails.

    Return the duration of the head or tail, in seconds.

    :param min_length: estimated minimum length
    :type  min_length: :class:`~aeneas.timevalue.TimeValue`
    :param max_length: estimated maximum length
    :type  max_length: :class:`~aeneas.timevalue.TimeValue`
    :param bool tail: if ``True``, detect the tail instead of the head
    :rtype: :class:`~aeneas.timevalue.TimeValue`
    :raises: TypeError: if one of the parameters is not ``None`` or a number
    :raises: ValueError: if one of the parameters is negative
    """
    def _sanitize(value, default, name):
        # Replace None with the default and convert to TimeValue.
        # NOTE(review): log_exc is called with True and an exception class;
        # presumably it raises that exception -- confirm against log_exc.
        if value is None:
            value = default
        try:
            value = TimeValue(value)
        except (TypeError, ValueError, InvalidOperation) as exc:
            self.log_exc(u"The value of %s is not a number" % (name), exc, True, TypeError)
        if value < 0:
            self.log_exc(u"The value of %s is negative" % (name), None, True, ValueError)
        return value

    min_length = _sanitize(min_length, self.MIN_LENGTH, "min_length")
    max_length = _sanitize(max_length, self.MAX_LENGTH, "max_length")
    mws = self.rconf.mws
    min_length_frames = int(min_length / mws)
    max_length_frames = int(max_length / mws)
    self.log([u"MFCC window shift s: %.3f", mws])
    self.log([u"Min start length s: %.3f", min_length])
    self.log([u"Min start length frames: %d", min_length_frames])
    self.log([u"Max start length s: %.3f", max_length])
    self.log([u"Max start length frames: %d", max_length_frames])
    self.log([u"Tail?: %s", str(tail)])

    self.log(u"Synthesizing query...")
    synt_duration = max_length * self.QUERY_FACTOR
    self.log([u"Synthesizing at least %.3f seconds", synt_duration])
    tmp_handler, tmp_file_path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
    try:
        synt = Synthesizer(rconf=self.rconf, logger=self.logger)
        # only the audio written to tmp_file_path is used below
        anchors, total_time, synthesized_chars = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=tail
        )
        self.log(u"Synthesizing query... done")

        self.log(u"Extracting MFCCs for query...")
        query_mfcc = AudioFileMFCC(tmp_file_path, rconf=self.rconf, logger=self.logger)
        self.log(u"Extracting MFCCs for query... done")
    finally:
        # do not leave the temporary file behind if a step above fails
        self.log(u"Cleaning up...")
        gf.delete_file(tmp_handler, tmp_file_path)
        self.log(u"Cleaning up... done")

    search_window = max_length * self.AUDIO_FACTOR
    search_window_end = min(int(search_window / mws), self.real_wave_mfcc.all_length)
    self.log([u"Query MFCC length (frames): %d", query_mfcc.all_length])
    self.log([u"Real MFCC length (frames): %d", self.real_wave_mfcc.all_length])
    self.log([u"Search window end (s): %.3f", search_window])
    self.log([u"Search window end (frames): %d", search_window_end])

    if tail:
        self.log(u"Tail => reversing real_wave_mfcc and query_mfcc")
        self.real_wave_mfcc.reverse()
        query_mfcc.reverse()

    # NOTE: VAD will be run here, if not done before
    speech_intervals = self.real_wave_mfcc.intervals(speech=True, time=False)
    if len(speech_intervals) < 1:
        self.log(u"No speech intervals, hence no start found")
        if tail:
            self.real_wave_mfcc.reverse()
        return TimeValue("0.000")

    # generate a list of begin indices
    # search_end is the end of the last speech interval visited,
    # which is the first one ending at or after the search window end, if any
    search_end = None
    candidates_begin = []
    for interval in speech_intervals:
        if (interval[0] >= min_length_frames) and (interval[0] <= max_length_frames):
            candidates_begin.append(interval[0])
        search_end = interval[1]
        if search_end >= search_window_end:
            break

    # for each begin index, compute the acm cost
    # to match the query
    # note that we take the min over the last column of the acm
    # meaning that we allow to match the entire query wave
    # against a portion of the real wave
    candidates = []
    for candidate_begin in candidates_begin:
        self.log([u"Candidate interval starting at %d == %.3f", candidate_begin, candidate_begin * mws])
        try:
            rwm = AudioFileMFCC(
                mfcc_matrix=self.real_wave_mfcc.all_mfcc[:, candidate_begin:search_end],
                rconf=self.rconf,
                logger=self.logger
            )
            dtw = DTWAligner(
                real_wave_mfcc=rwm,
                synt_wave_mfcc=query_mfcc,
                rconf=self.rconf,
                logger=self.logger
            )
            acm = dtw.compute_accumulated_cost_matrix()
            last_column = acm[:, -1]
            min_value = numpy.min(last_column)
            min_index = numpy.argmin(last_column)
            self.log([u"Candidate interval: %d %d == %.3f %.3f", candidate_begin, search_end, candidate_begin * mws, search_end * mws])
            self.log([u" Min value: %.6f", min_value])
            self.log([u" Min index: %d == %.3f", min_index, min_index * mws])
            candidates.append((min_value, candidate_begin, min_index))
        except Exception as exc:
            # best effort: a failing candidate is logged and skipped
            self.log_exc(u"An unexpected error occurred while running _detect", exc, False, None)

    # reverse again the real wave
    if tail:
        self.log(u"Tail => reversing real_wave_mfcc again")
        self.real_wave_mfcc.reverse()

    # return
    if not candidates:
        self.log(u"No candidates found")
        return TimeValue("0.000")
    self.log(u"Candidates:")
    for candidate in candidates:
        self.log([u" Value: %.6f Begin Time: %.3f Min Index: %d", candidate[0], candidate[1] * mws, candidate[2]])
    # candidates are (min_value, begin, min_index) tuples:
    # the smallest tuple is the one with the lowest cost
    best = sorted(candidates)[0][1]
    self.log([u"Best candidate: %d == %.3f", best, best * mws])
    return best * mws