def _convert(self):
    """
    Convert the audio file into a ``wav`` file.

    Return a triple:

    1. a success bool flag
    2. handler of the generated wave file
    3. path of the generated wave file
    """
    self._log("Converting real audio to wav")
    handler = None
    path = None
    try:
        self._log("Creating an output tempfile")
        handler, path = tempfile.mkstemp(
            suffix=".wav",
            dir=gf.custom_tmp_dir()
        )
        self._log("Creating a FFMPEGWrapper")
        ffmpeg = FFMPEGWrapper(logger=self.logger)
        self._log("Converting...")
        ffmpeg.convert(
            input_file_path=self.task.audio_file_path_absolute,
            output_file_path=path,
            head_length=self.task.configuration.is_audio_file_head_length,
            process_length=self.task.configuration.is_audio_file_process_length)
        self._log("Converting... done")
        self._log("Converting real audio to wav: succeeded")
        return (True, handler, path)
    except Exception as e:
        # catch Exception instead of a bare except, so that
        # KeyboardInterrupt/SystemExit are not swallowed,
        # and log the message (consistent with the other converters)
        self._log("Converting real audio to wav: failed")
        self._log(["Message: %s", str(e)])
        return (False, handler, path)
def _synthesize(self):
    """
    Synthesize text into a ``wav`` file.

    Return a quadruple:

    1. a success bool flag
    2. handler of the generated wave file
    3. path of the generated wave file
    4. the list of anchors, that is, a list of floats
       each representing the start time of the corresponding
       text fragment in the generated wave file
       ``[start_1, start_2, ..., start_n]``
    """
    self._log("Synthesizing text")
    handler, path, anchors = None, None, None
    try:
        self._log("Creating an output tempfile")
        handler, path = tempfile.mkstemp(
            suffix=".wav",
            dir=gf.custom_tmp_dir()
        )
        self._log("Creating Synthesizer object")
        synthesizer = Synthesizer(logger=self.logger)
        self._log("Synthesizing...")
        # synthesize() returns a tuple whose first element
        # is the list of time anchors
        anchors = synthesizer.synthesize(self.task.text_file, path)[0]
        self._log("Synthesizing... done")
        self._log("Synthesizing text: succeeded")
        return (True, handler, path, anchors)
    except Exception as e:
        self._log("Synthesizing text: failed")
        self._log(["Message: %s", str(e)])
        return (False, handler, path, anchors)
def _convert(self):
    """
    Convert the entire audio file into a ``wav`` file.

    (Head/tail will be cut off later.)

    Return a triple:

    1. a success bool flag
    2. handler of the generated wave file
    3. path of the generated wave file
    """
    self._log("Converting real audio to wav")
    handler, path = None, None
    try:
        self._log("Creating an output tempfile")
        handler, path = tempfile.mkstemp(suffix=".wav",
                                         dir=gf.custom_tmp_dir())
        self._log("Creating a FFMPEGWrapper")
        converter = FFMPEGWrapper(logger=self.logger)
        self._log("Converting...")
        converter.convert(
            input_file_path=self.task.audio_file_path_absolute,
            output_file_path=path)
        self._log("Converting... done")
        self._log("Converting real audio to wav: succeeded")
        return (True, handler, path)
    except Exception as e:
        self._log("Converting real audio to wav: failed")
        self._log(["Message: %s", str(e)])
        return (False, handler, path)
def _convert(self):
    """
    Convert the entire audio file into a ``wav`` file.

    (Head/tail will be cut off later.)

    Return a triple:

    1. a success bool flag
    2. handler of the generated wave file
    3. path of the generated wave file
    """
    self._log("Converting real audio to wav")
    handler = None
    path = None
    try:
        self._log("Creating an output tempfile")
        # temporary wav file that will hold the converted audio
        handler, path = tempfile.mkstemp(
            suffix=".wav",
            dir=gf.custom_tmp_dir()
        )
        self._log("Creating a FFMPEGWrapper")
        wrapper = FFMPEGWrapper(logger=self.logger)
        self._log("Converting...")
        wrapper.convert(
            input_file_path=self.task.audio_file_path_absolute,
            output_file_path=path
        )
        self._log("Converting... done")
        self._log("Converting real audio to wav: succeeded")
        return (True, handler, path)
    except Exception as e:
        self._log("Converting real audio to wav: failed")
        self._log(["Message: %s", str(e)])
        return (False, handler, path)
def _synthesize(self):
    """
    Synthesize text into a ``wav`` file.

    Return a quadruple:

    1. a success bool flag
    2. handler of the generated wave file
    3. path of the generated wave file
    4. the list of anchors, that is, a list of floats
       each representing the start time of the corresponding
       text fragment in the generated wave file
       ``[start_1, start_2, ..., start_n]``
    """
    self._log("Synthesizing text")
    handler = None
    path = None
    anchors = None
    try:
        self._log("Creating an output tempfile")
        handler, path = tempfile.mkstemp(
            suffix=".wav",
            dir=gf.custom_tmp_dir()
        )
        self._log("Creating Synthesizer object")
        synt = Synthesizer(logger=self.logger)
        self._log("Synthesizing...")
        anchors = synt.synthesize(self.task.text_file, path)
        self._log("Synthesizing... done")
        self._log("Synthesizing text: succeeded")
        return (True, handler, path, anchors)
    except Exception as e:
        # catch Exception instead of a bare except, and log the
        # message, consistent with the other _synthesize variant
        self._log("Synthesizing text: failed")
        self._log(["Message: %s", str(e)])
        return (False, handler, path, anchors)
def synthesize(self, text_file, audio_file_path): """ Synthesize the text contained in the given fragment list into a ``wav`` file. :param text_file: the text file to be synthesized :type text_file: :class:`aeneas.textfile.TextFile` :param audio_file_path: the path to the output audio file :type audio_file_path: string (path) """ # time anchors anchors = [] # initialize time current_time = 0.0 # waves is used to concatenate all the fragments WAV files waves = numpy.array([]) # espeak wrapper espeak = ESPEAKWrapper(logger=self.logger) num = 0 # for each fragment, synthesize it and concatenate it for fragment in text_file.fragments: # synthesize and get the duration of the output file self._log("Synthesizing fragment %d" % num) handler, tmp_destination = tempfile.mkstemp( suffix=".wav", dir=gf.custom_tmp_dir() ) duration = espeak.synthesize( text=fragment.text, language=fragment.language, output_file_path=tmp_destination ) # store for later output anchors.append([current_time, fragment.identifier, fragment.text]) # concatenate to buffer self._log("Fragment %d starts at: %f" % (num, current_time)) if duration > 0: self._log("Fragment %d duration: %f" % (num, duration)) current_time += duration data, sample_frequency, encoding = wavread(tmp_destination) # # TODO this might result in memory swapping # if we have a large number of fragments # is there a better way? # # waves = numpy.concatenate((waves, data)) # # append seems faster than concatenate, as it should waves = numpy.append(waves, data) else: self._log("Fragment %d has zero duration" % num) # remove temporary file self._log("Removing temporary file '%s'" % tmp_destination) os.close(handler) os.remove(tmp_destination) num += 1 # output WAV file, concatenation of synthesized fragments self._log("Writing audio file '%s'" % audio_file_path) wavwrite(waves, audio_file_path, sample_frequency, encoding) # return the time anchors self._log("Returning %d time anchors" % len(anchors)) return anchors
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect the start of the spoken text inside the audio file.

    Synthesize (at most) the beginning of the text into a query wave,
    extract its MFCCs, and slide a DTW alignment over the admissible
    speech intervals of the real audio, collecting candidate matches.
    Return the start time (via ``self._i2t``) of the best candidate
    according to ``metric``, or ``0.0`` if no candidate was found.

    :param min_start_length: minimum admissible start time
    :param max_start_length: maximum admissible start time
    :param metric: candidate selection metric (see ``SDMetric``)
    :param backwards: if ``True``, synthesize from the end of the text
                      and reverse the query wave
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric: %s", metric])
    self._log(["Backwards: %s", str(backwards)])
    # characters-per-second rate of the real audio
    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate: %.3f", audio_rate])
    # synthesize a query covering QUERY_FACTOR times the max start length
    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav", dir=gf.custom_tmp_dir())
    synt = Synthesizer(logger=self.logger)
    synt_duration = max_start_length * self.QUERY_FACTOR
    self._log(["Synthesizing %.3f seconds", synt_duration])
    result = synt.synthesize(self.text_file, tmp_file_path, quit_after=synt_duration, backwards=backwards)
    self._log("Synthesizing query... done")
    query_file = AudioFile(tmp_file_path)
    if backwards:
        self._log("Reversing query")
        query_file.reverse()
    self._log("Extracting MFCCs for query...")
    query_file.extract_mfcc(frame_rate=self.frame_rate)
    query_file.clear_data()
    self._log("Extracting MFCCs for query... done")
    self._log("Cleaning up...")
    self._cleanup(tmp_handler, tmp_file_path)
    self._log("Cleaning up... done")
    # result[2] is the number of characters actually synthesized
    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len
    # stretch factor compensates for the synthesized voice speaking
    # faster/slower than the real audio (never below 1)
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate: %.3f", audio_rate])
    self._log(["Query rate: %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])
    # limit the real audio MFCCs to the search window
    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])
    # o = number of audio frames, n = number of query frames
    l, o = audio_mfcc.shape
    l, n = query_mfcc.shape
    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(n * stretch_factor)
    self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
    self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
    self._log(["Stretch factor: %.3f", stretch_factor])
    self._log(
        ["Required minimum length: %.3f", stretched_match_minimum_length])
    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log([
            " %d %d == %.3f %.3f",
            self._t2i(interval[0]),
            self._t2i(interval[1]),
            interval[0],
            interval[1]
        ])
    # keep only speech intervals whose start falls in the allowed range
    admissible_intervals = [
        x for x in self.audio_speech
        if ((x[0] >= min_start_length) and (x[0] <= max_start_length))
    ]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log([
            " %d %d == %.3f %.3f",
            self._t2i(interval[0]),
            self._t2i(interval[1]),
            interval[0],
            interval[1]
        ])
    candidates = []
    # heuristics to stop early: too many minimum-length matches in a row,
    # or too many runs without improving the best metric value
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf
    for interval in admissible_intervals:
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log(" Breaking: too many runs without improvement")
            break
        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log(
                " Breaking: too many runs with minimum required length")
            break
        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log([
            "Evaluating interval starting at %d == %.3f ",
            start_index,
            start_time
        ])
        if start_index > o:
            self._log(" Breaking: start index outside audio window")
            break
        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > o:
            self._log(
                " Breaking: not enough audio left in shifted window")
            break
        # compare the query against at most 2 * n audio frames
        end_index = min(start_index + 2 * n, o)
        end_time = self._i2t(end_index)
        self._log([" Start %d == %.3f", start_index, start_time])
        self._log([" Req end %d == %.3f", req_end_index, req_end_time])
        self._log([" Eff end %d == %.3f", end_index, end_time])
        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        l, m = audio_mfcc_sub.shape
        self._log("Computing DTW...")
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(m)
        aligner.synt_wave_length = self._i2t(n)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        last_row = acm[-1, :]
        self._log("Computing DTW... done")
        # find the minimum, but its index must be >= stretched_match_minimum_length
        candidate_argmin_index = numpy.argmin(
            last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        # distortion = accumulated cost normalized by match length
        candidate_distortion = candidate_value / candidate_length_index
        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0
        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        # append to the list of candidates
        self._log([
            " Interval start: %d == %.6f",
            start_index,
            start_time
        ])
        self._log(
            [" Interval end: %d == %.6f", end_index, end_time])
        self._log([
            " Candidate start: %d == %.6f",
            start_index,
            start_time
        ])
        self._log([
            " Candidate end: %d == %.6f",
            candidate_end_index,
            candidate_end_time
        ])
        self._log([
            " Candidate length: %d == %.6f",
            candidate_length_index,
            candidate_length_time
        ])
        self._log([" Candidate value: %.6f", candidate_value])
        self._log([" Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })
    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time
def load_job_from_container(self, container_path, config_string=None):
    """
    Validate the given container, and, if it is well formed,
    load the job from it.

    If ``config_string`` is ``None``,
    the container must contain a configuration file;
    otherwise use the provided config string
    (i.e., the wizard case).

    Return ``True`` if the job has been loaded successfully,
    ``False`` otherwise.

    :param container_path: the path to the input container
    :type container_path: string (path)
    :param config_string: the configuration string (from wizard)
    :type config_string: string
    :rtype: bool
    """
    self._log("Loading job from container...")

    # validate container
    self._log("Validating container...")
    validator = Validator(logger=self.logger)
    # PEP 8: compare against None with "is", not "=="
    if config_string is None:
        validator_result = validator.check_container(container_path)
    else:
        validator_result = validator.check_container_from_wizard(
            container_path,
            config_string
        )
    if not validator_result.passed:
        self._log("Validating container: failed")
        self._log("Loading job from container: failed")
        return False
    self._log("Validating container: succeeded")

    try:
        # create working directory where the input container
        # will be decompressed
        self.working_directory = tempfile.mkdtemp(dir=gf.custom_tmp_dir())
        self._log("Created working directory '%s'" % self.working_directory)

        # decompress
        self._log("Decompressing input container...")
        input_container = Container(container_path, logger=self.logger)
        input_container.decompress(self.working_directory)
        self._log("Decompressing input container... done")

        # create job from the working directory
        self._log("Creating job from working directory...")
        working_container = Container(
            self.working_directory,
            logger=self.logger
        )
        analyzer = AnalyzeContainer(working_container, logger=self.logger)
        if config_string is None:
            self.job = analyzer.analyze()
        else:
            self.job = analyzer.analyze_from_wizard(config_string)
        self._log("Creating job from working directory... done")

        # set absolute path for text file and audio file
        # for each task in the job
        self._log("Setting absolute paths for tasks...")
        for task in self.job.tasks:
            task.text_file_path_absolute = gf.norm_join(
                self.working_directory,
                task.text_file_path
            )
            task.audio_file_path_absolute = gf.norm_join(
                self.working_directory,
                task.audio_file_path
            )
        self._log("Setting absolute paths for tasks... done")

        # return
        self._log("Loading job from container: succeeded")
        return True
    except Exception as e:
        # failure: clean and return
        # catch Exception (not a bare except) so that
        # KeyboardInterrupt/SystemExit are not swallowed,
        # and log the error message for diagnosis
        self.clean()
        self._log("Loading job from container: failed")
        self._log(["Message: %s", str(e)])
        return False
def write_output_container(self, output_directory_path):
    """
    Write the output container for this job.

    Return a pair ``(bool, string)``, where the bool
    indicates whether the execution succeeded,
    and the string is the path to output container.

    :param output_directory_path: the path to a directory where
                                  the output container must be created
    :type output_directory_path: string (path)
    :rtype: (bool, string)
    """
    self._log("Writing output container for this job")

    # check if the job has tasks
    # PEP 8: compare against None with "is", not "=="
    if self.job is None:
        self._log("job is None")
        return (False, None)
    if len(self.job) == 0:
        self._log("The job has no tasks")
        return (False, None)

    try:
        # create temporary directory where the sync map files
        # will be created
        # this temporary directory will be compressed into
        # the output container
        self.tmp_directory = tempfile.mkdtemp(dir=gf.custom_tmp_dir())
        self._log("Created temporary directory '%s'" % self.tmp_directory)

        for task in self.job.tasks:
            custom_id = task.configuration.custom_id
            # check if the task has sync map and sync map file path
            if task.sync_map_file_path is None:
                self._log("Task '%s' has sync_map_file_path not set" % custom_id)
                return (False, None)
            if task.sync_map is None:
                self._log("Task '%s' has sync_map not set" % custom_id)
                return (False, None)
            # output sync map
            self._log("Outputting sync map for task '%s'..." % custom_id)
            task.output_sync_map_file(self.tmp_directory)
            self._log("Outputting sync map for task '%s'... done" % custom_id)

        # get output container info
        output_container_format = self.job.configuration.os_container_format
        self._log("Output container format: '%s'" % output_container_format)
        output_file_name = self.job.configuration.os_file_name
        if ((output_container_format != ContainerFormat.UNPACKED) and
                (not output_file_name.endswith(output_container_format))):
            self._log("Adding extension to output_file_name")
            output_file_name += "." + output_container_format
        self._log("Output file name: '%s'" % output_file_name)
        output_file_path = gf.norm_join(
            output_directory_path,
            output_file_name
        )
        self._log("Output file path: '%s'" % output_file_path)

        # create output container
        self._log("Compressing...")
        container = Container(
            output_file_path,
            output_container_format,
            logger=self.logger
        )
        container.compress(self.tmp_directory)
        self._log("Compressing... done")
        self._log("Created output file: '%s'" % output_file_path)

        # clean and return
        self.clean(False)
        return (True, output_file_path)
    except Exception as e:
        # catch Exception (not a bare except) so that
        # KeyboardInterrupt/SystemExit are not swallowed,
        # and log the error message for diagnosis
        self.clean(False)
        self._log(["Message: %s", str(e)])
        return (False, None)
def load_job_from_container(self, container_path, config_string=None):
    """
    Validate the given container, and, if it is well formed,
    load the job from it.

    If ``config_string`` is ``None``,
    the container must contain a configuration file;
    otherwise use the provided config string
    (i.e., the wizard case).

    Return ``True`` if the job has been loaded successfully,
    ``False`` otherwise.

    :param container_path: the path to the input container
    :type container_path: string (path)
    :param config_string: the configuration string (from wizard)
    :type config_string: string
    :rtype: bool
    """
    self._log("Loading job from container...")

    # validate container
    self._log("Validating container...")
    validator = Validator(logger=self.logger)
    if config_string is None:
        validator_result = validator.check_container(container_path)
    else:
        validator_result = validator.check_container_from_wizard(
            container_path,
            config_string)
    if not validator_result.passed:
        self._log("Validating container: failed")
        self._log("Loading job from container: failed")
        return False
    self._log("Validating container: succeeded")

    try:
        # create working directory where the input container
        # will be decompressed
        self.working_directory = tempfile.mkdtemp(dir=gf.custom_tmp_dir())
        self._log(
            ["Created working directory '%s'", self.working_directory])

        # decompress
        self._log("Decompressing input container...")
        input_container = Container(container_path, logger=self.logger)
        input_container.decompress(self.working_directory)
        self._log("Decompressing input container... done")

        # create job from the working directory
        self._log("Creating job from working directory...")
        working_container = Container(self.working_directory,
                                      logger=self.logger)
        analyzer = AnalyzeContainer(working_container, logger=self.logger)
        if config_string is None:
            self.job = analyzer.analyze()
        else:
            self.job = analyzer.analyze_from_wizard(config_string)
        self._log("Creating job from working directory... done")

        # set absolute path for text file and audio file
        # for each task in the job
        self._log("Setting absolute paths for tasks...")
        for task in self.job.tasks:
            task.text_file_path_absolute = gf.norm_join(
                self.working_directory,
                task.text_file_path)
            task.audio_file_path_absolute = gf.norm_join(
                self.working_directory,
                task.audio_file_path)
        self._log("Setting absolute paths for tasks... done")

        # return
        self._log("Loading job from container: succeeded")
        return True
    except Exception as e:
        # failure: clean and return
        # catch Exception (not a bare except) so that
        # KeyboardInterrupt/SystemExit are not swallowed,
        # and log the error message for diagnosis
        self.clean()
        self._log("Loading job from container: failed")
        self._log(["Message: %s", str(e)])
        return False
def write_output_container(self, output_directory_path):
    """
    Write the output container for this job.

    Return a pair ``(bool, string)``, where the bool
    indicates whether the execution succeeded,
    and the string is the path to output container.

    :param output_directory_path: the path to a directory where
                                  the output container must be created
    :type output_directory_path: string (path)
    :rtype: (bool, string)
    """
    self._log("Writing output container for this job")

    # check if the job has tasks
    if self.job is None:
        self._log("job is None")
        return (False, None)
    if len(self.job) == 0:
        self._log("The job has no tasks")
        return (False, None)

    try:
        # create temporary directory where the sync map files
        # will be created
        # this temporary directory will be compressed into
        # the output container
        self.tmp_directory = tempfile.mkdtemp(dir=gf.custom_tmp_dir())
        self._log(["Created temporary directory '%s'", self.tmp_directory])

        for task in self.job.tasks:
            custom_id = task.configuration.custom_id
            # check if the task has sync map and sync map file path
            if task.sync_map_file_path is None:
                self._log([
                    "Task '%s' has sync_map_file_path not set",
                    custom_id
                ])
                return (False, None)
            if task.sync_map is None:
                self._log(["Task '%s' has sync_map not set", custom_id])
                return (False, None)
            # output sync map
            self._log(["Outputting sync map for task '%s'...", custom_id])
            task.output_sync_map_file(self.tmp_directory)
            self._log(
                ["Outputting sync map for task '%s'... done", custom_id])

        # get output container info
        output_container_format = self.job.configuration.os_container_format
        self._log(
            ["Output container format: '%s'", output_container_format])
        output_file_name = self.job.configuration.os_file_name
        if ((output_container_format != ContainerFormat.UNPACKED) and
                (not output_file_name.endswith(output_container_format))):
            self._log("Adding extension to output_file_name")
            output_file_name += "." + output_container_format
        self._log(["Output file name: '%s'", output_file_name])
        output_file_path = gf.norm_join(output_directory_path,
                                        output_file_name)
        self._log(["Output file path: '%s'", output_file_path])

        # create output container
        self._log("Compressing...")
        container = Container(output_file_path,
                              output_container_format,
                              logger=self.logger)
        container.compress(self.tmp_directory)
        self._log("Compressing... done")
        self._log(["Created output file: '%s'", output_file_path])

        # clean and return
        self.clean(False)
        return (True, output_file_path)
    except Exception as e:
        # catch Exception (not a bare except) so that
        # KeyboardInterrupt/SystemExit are not swallowed,
        # and log the error message for diagnosis
        self.clean(False)
        self._log(["Message: %s", str(e)])
        return (False, None)
def main(): """ Entry point """ if len(sys.argv) < 5: usage() return language = sys.argv[1] text_file_path = sys.argv[2] text_format = sys.argv[3] audio_file_path = sys.argv[-1] verbose = False parameters = {} for i in range(4, len(sys.argv) - 1): args = sys.argv[i].split("=") if len(args) == 1: verbose = (args[0] in ["v", "-v", "verbose", "--verbose"]) if len(args) == 2: key, value = args if key == "id_regex": parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX] = value if key == "class_regex": parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX] = value if key == "sort": parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT] = value if key == "min_head_length": parameters["min_head_length"] = float(value) if key == "max_head_length": parameters["max_head_length"] = float(value) if key == "min_tail_length": parameters["min_head_length"] = float(value) if key == "max_tail_length": parameters["max_tail_length"] = float(value) if not gf.can_run_c_extension(): print "[WARN] Unable to load Python C Extensions" print "[WARN] Running the slower pure Python code" print "[WARN] See the README file for directions to compile the Python C Extensions" logger = Logger(tee=verbose) print "[INFO] Reading audio..." tmp_handler, tmp_file_path = tempfile.mkstemp(suffix=".wav", dir=gf.custom_tmp_dir()) converter = FFMPEGWrapper(logger=logger) converter.convert(audio_file_path, tmp_file_path) audio_file = AudioFile(tmp_file_path) print "[INFO] Reading audio... done" print "[INFO] Reading text..." if text_format == "list": text_file = TextFile() text_file.read_from_list(text_file_path.split("|")) else: text_file = TextFile(text_file_path, text_format, parameters) text_file.set_language(language) print "[INFO] Reading text... done" print "[INFO] Detecting audio interval..." 
sd = SD(audio_file, text_file, logger=logger) min_head_length = gc.SD_MIN_HEAD_LENGTH if "min_head_length" in parameters: min_head_length = parameters["min_head_length"] max_head_length = gc.SD_MAX_HEAD_LENGTH if "max_head_length" in parameters: max_head_length = parameters["max_head_length"] min_tail_length = gc.SD_MIN_TAIL_LENGTH if "min_tail_length" in parameters: min_tail_length = parameters["min_tail_length"] max_tail_length = gc.SD_MAX_TAIL_LENGTH if "max_tail_length" in parameters: max_tail_length = parameters["max_tail_length"] start, end = sd.detect_interval(min_head_length, max_head_length, min_tail_length, max_tail_length) zero = 0 audio_len = audio_file.audio_length head_len = start text_len = end - start tail_len = audio_len - end print "[INFO] Detecting audio interval... done" print "[INFO] " print "[INFO] Head: %.3f %.3f (%.3f)" % (zero, start, head_len) print "[INFO] Text: %.3f %.3f (%.3f)" % (start, end, text_len) print "[INFO] Tail: %.3f %.3f (%.3f)" % (end, audio_len, tail_len) print "[INFO] " zero_h = gf.time_to_hhmmssmmm(0) start_h = gf.time_to_hhmmssmmm(start) end_h = gf.time_to_hhmmssmmm(end) audio_len_h = gf.time_to_hhmmssmmm(audio_len) head_len_h = gf.time_to_hhmmssmmm(head_len) text_len_h = gf.time_to_hhmmssmmm(text_len) tail_len_h = gf.time_to_hhmmssmmm(tail_len) print "[INFO] Head: %s %s (%s)" % (zero_h, start_h, head_len_h) print "[INFO] Text: %s %s (%s)" % (start_h, end_h, text_len_h) print "[INFO] Tail: %s %s (%s)" % (end_h, audio_len_h, tail_len_h) #print "[INFO] Cleaning up..." cleanup(tmp_handler, tmp_file_path)
def synthesize(self, text_file, audio_file_path, quit_after=None, backwards=False): """ Synthesize the text contained in the given fragment list into a ``wav`` file. :param text_file: the text file to be synthesized :type text_file: :class:`aeneas.textfile.TextFile` :param audio_file_path: the path to the output audio file :type audio_file_path: string (path) :param quit_after: stop synthesizing as soon as reaching this many seconds :type quit_after: float :param backwards: synthesizing from the end of the text file :type backwards: bool """ # time anchors anchors = [] # initialize time current_time = 0.0 # waves is used to concatenate all the fragments WAV files waves = numpy.array([]) # espeak wrapper espeak = ESPEAKWrapper(logger=self.logger) if quit_after is not None: self._log(["Quit after reaching %.3f", quit_after]) if backwards: self._log("Synthesizing backwards") # for each fragment, synthesize it and concatenate it num = 0 num_chars = 0 fragments = text_file.fragments if backwards: fragments = fragments[::-1] for fragment in fragments: # synthesize and get the duration of the output file self._log(["Synthesizing fragment %d", num]) handler, tmp_destination = tempfile.mkstemp( suffix=".wav", dir=gf.custom_tmp_dir()) duration = espeak.synthesize(text=fragment.text, language=fragment.language, output_file_path=tmp_destination) # store for later output anchors.append([current_time, fragment.identifier, fragment.text]) # increase the character counter num_chars += fragment.characters # concatenate to buffer self._log(["Fragment %d starts at: %f", num, current_time]) if duration > 0: self._log(["Fragment %d duration: %f", num, duration]) current_time += duration data, sample_frequency, encoding = wavread(tmp_destination) # # TODO this might result in memory swapping # if we have a large number of fragments # is there a better way? 
# # NOTE since append cannot be in place, # it seems that the only alternative is pre-allocating # the destination array, # possibly truncating or extending it as needed # if backwards: waves = numpy.append(data, waves) else: waves = numpy.append(waves, data) else: self._log(["Fragment %d has zero duration", num]) # remove temporary file self._log(["Removing temporary file '%s'", tmp_destination]) os.close(handler) os.remove(tmp_destination) num += 1 if (quit_after is not None) and (current_time > quit_after): self._log( ["Quitting after reached duration %.3f", current_time]) break # output WAV file, concatenation of synthesized fragments self._log(["Writing audio file '%s'", audio_file_path]) wavwrite(waves, audio_file_path, sample_frequency, encoding) # return the time anchors # TODO anchors do not make sense if backwards == True self._log(["Returning %d time anchors", len(anchors)]) self._log(["Current time %.3f", current_time]) self._log(["Synthesized %d characters", num_chars]) return (anchors, current_time, num_chars)
def main(): """ Entry point """ if len(sys.argv) < 5: usage() return language = sys.argv[1] text_file_path = sys.argv[2] text_format = sys.argv[3] audio_file_path = sys.argv[-1] verbose = False parameters = {} for i in range(4, len(sys.argv)-1): args = sys.argv[i].split("=") if len(args) == 1: verbose = (args[0] in ["v", "-v", "verbose", "--verbose"]) if len(args) == 2: key, value = args if key == "id_regex": parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX] = value if key == "class_regex": parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX] = value if key == "sort": parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT] = value if key == "min_head_length": parameters["min_head_length"] = float(value) if key == "max_head_length": parameters["max_head_length"] = float(value) if key == "min_tail_length": parameters["min_head_length"] = float(value) if key == "max_tail_length": parameters["max_tail_length"] = float(value) if not gf.can_run_c_extension(): print "[WARN] Unable to load Python C Extensions" print "[WARN] Running the slower pure Python code" print "[WARN] See the README file for directions to compile the Python C Extensions" logger = Logger(tee=verbose) print "[INFO] Reading audio..." tmp_handler, tmp_file_path = tempfile.mkstemp( suffix=".wav", dir=gf.custom_tmp_dir() ) converter = FFMPEGWrapper(logger=logger) converter.convert(audio_file_path, tmp_file_path) audio_file = AudioFile(tmp_file_path) print "[INFO] Reading audio... done" print "[INFO] Reading text..." if text_format == "list": text_file = TextFile() text_file.read_from_list(text_file_path.split("|")) else: text_file = TextFile(text_file_path, text_format, parameters) text_file.set_language(language) print "[INFO] Reading text... done" print "[INFO] Detecting audio interval..." 
sd = SD(audio_file, text_file, logger=logger) min_head_length = gc.SD_MIN_HEAD_LENGTH if "min_head_length" in parameters: min_head_length = parameters["min_head_length"] max_head_length = gc.SD_MAX_HEAD_LENGTH if "max_head_length" in parameters: max_head_length = parameters["max_head_length"] min_tail_length = gc.SD_MIN_TAIL_LENGTH if "min_tail_length" in parameters: min_tail_length = parameters["min_tail_length"] max_tail_length = gc.SD_MAX_TAIL_LENGTH if "max_tail_length" in parameters: max_tail_length = parameters["max_tail_length"] start, end = sd.detect_interval( min_head_length, max_head_length, min_tail_length, max_tail_length ) zero = 0 audio_len = audio_file.audio_length head_len = start text_len = end - start tail_len = audio_len - end print "[INFO] Detecting audio interval... done" print "[INFO] " print "[INFO] Head: %.3f %.3f (%.3f)" % (zero, start, head_len) print "[INFO] Text: %.3f %.3f (%.3f)" % (start, end, text_len) print "[INFO] Tail: %.3f %.3f (%.3f)" % (end, audio_len, tail_len) print "[INFO] " zero_h = gf.time_to_hhmmssmmm(0) start_h = gf.time_to_hhmmssmmm(start) end_h = gf.time_to_hhmmssmmm(end) audio_len_h = gf.time_to_hhmmssmmm(audio_len) head_len_h = gf.time_to_hhmmssmmm(head_len) text_len_h = gf.time_to_hhmmssmmm(text_len) tail_len_h = gf.time_to_hhmmssmmm(tail_len) print "[INFO] Head: %s %s (%s)" % (zero_h, start_h, head_len_h) print "[INFO] Text: %s %s (%s)" % (start_h, end_h, text_len_h) print "[INFO] Tail: %s %s (%s)" % (end_h, audio_len_h, tail_len_h) #print "[INFO] Cleaning up..." cleanup(tmp_handler, tmp_file_path)
def test_custom_tmp_dir(self):
    # gf.custom_tmp_dir() must return the POSIX default path on
    # Linux/Mac OS X and the non-POSIX default everywhere else
    expected = (
        gc.TMP_PATH_DEFAULT_POSIX
        if sys.platform in ["linux", "linux2", "darwin"]
        else gc.TMP_PATH_DEFAULT_NONPOSIX
    )
    self.assertEqual(gf.custom_tmp_dir(), expected)
def synthesize(self, text_file, audio_file_path, quit_after=None, backwards=False): """ Synthesize the text contained in the given fragment list into a ``wav`` file. :param text_file: the text file to be synthesized :type text_file: :class:`aeneas.textfile.TextFile` :param audio_file_path: the path to the output audio file :type audio_file_path: string (path) :param quit_after: stop synthesizing as soon as reaching this many seconds :type quit_after: float :param backwards: synthesizing from the end of the text file :type backwards: bool """ # time anchors anchors = [] # initialize time current_time = 0.0 # waves is used to concatenate all the fragments WAV files waves = numpy.array([]) # espeak wrapper espeak = ESPEAKWrapper(logger=self.logger) if quit_after is not None: self._log(["Quit after reaching %.3f", quit_after]) if backwards: self._log("Synthesizing backwards") # for each fragment, synthesize it and concatenate it num = 0 num_chars = 0 fragments = text_file.fragments if backwards: fragments = fragments[::-1] for fragment in fragments: # synthesize and get the duration of the output file self._log(["Synthesizing fragment %d", num]) handler, tmp_destination = tempfile.mkstemp( suffix=".wav", dir=gf.custom_tmp_dir() ) duration = espeak.synthesize( text=fragment.text, language=fragment.language, output_file_path=tmp_destination ) # store for later output anchors.append([current_time, fragment.identifier, fragment.text]) # increase the character counter num_chars += fragment.characters # concatenate to buffer self._log(["Fragment %d starts at: %f", num, current_time]) if duration > 0: self._log(["Fragment %d duration: %f", num, duration]) current_time += duration data, sample_frequency, encoding = wavread(tmp_destination) # # TODO this might result in memory swapping # if we have a large number of fragments # is there a better way? 
# # NOTE since append cannot be in place, # it seems that the only alternative is pre-allocating # the destination array, # possibly truncating or extending it as needed # if backwards: waves = numpy.append(data, waves) else: waves = numpy.append(waves, data) else: self._log(["Fragment %d has zero duration", num]) # remove temporary file self._log(["Removing temporary file '%s'", tmp_destination]) os.close(handler) os.remove(tmp_destination) num += 1 if (quit_after is not None) and (current_time > quit_after): self._log(["Quitting after reached duration %.3f", current_time]) break # output WAV file, concatenation of synthesized fragments self._log(["Writing audio file '%s'", audio_file_path]) wavwrite(waves, audio_file_path, sample_frequency, encoding) # return the time anchors # TODO anchors do not make sense if backwards == True self._log(["Returning %d time anchors", len(anchors)]) self._log(["Current time %.3f", current_time]) self._log(["Synthesized %d characters", num_chars]) return (anchors, current_time, num_chars)
def _detect_start(self, min_start_length, max_start_length, metric, backwards=False):
    """
    Detect start.

    Synthesize a query from the text, extract its MFCCs, and slide a
    DTW alignment over the admissible speech intervals of the real
    audio, collecting candidate match positions; return the start time
    (in seconds) of the best candidate, or ``0.0`` if none was found.

    :param min_start_length: minimum admissible start time, in seconds
    :param max_start_length: maximum admissible start time, in seconds
    :param metric: candidate-ranking metric (``SDMetric.VALUE`` or
                   ``SDMetric.DISTORTION``)
    :param backwards: if ``True``, synthesize from the end of the text
                      and reverse the query audio
    """
    self._log(["Min start length: %.3f", min_start_length])
    self._log(["Max start length: %.3f", max_start_length])
    self._log(["Metric: %s", metric])
    self._log(["Backwards: %s", str(backwards)])
    # characters-per-second rate of the real audio, used to estimate
    # how much the query must be stretched to match it
    audio_rate = self.text_file.characters / self.audio_file.audio_length
    self._log(["Audio rate: %.3f", audio_rate])
    self._log("Synthesizing query...")
    tmp_handler, tmp_file_path = tempfile.mkstemp(
        suffix=".wav",
        dir=gf.custom_tmp_dir()
    )
    synt = Synthesizer(logger=self.logger)
    # synthesize more than max_start_length (by QUERY_FACTOR) so the
    # query always covers the window being searched
    synt_duration = max_start_length * self.QUERY_FACTOR
    self._log(["Synthesizing %.3f seconds", synt_duration])
    result = synt.synthesize(
        self.text_file,
        tmp_file_path,
        quit_after=synt_duration,
        backwards=backwards
    )
    self._log("Synthesizing query... done")
    query_file = AudioFile(tmp_file_path)
    if backwards:
        self._log("Reversing query")
        query_file.reverse()
    self._log("Extracting MFCCs for query...")
    query_file.extract_mfcc(frame_rate=self.frame_rate)
    # drop the raw samples; only the MFCC matrix is needed from here on
    query_file.clear_data()
    self._log("Extracting MFCCs for query... done")
    self._log("Cleaning up...")
    self._cleanup(tmp_handler, tmp_file_path)
    self._log("Cleaning up... done")
    # result is the triple returned by Synthesizer.synthesize();
    # index 2 is the number of synthesized characters
    query_characters = result[2]
    query_len = query_file.audio_length
    query_mfcc = query_file.audio_mfcc
    query_rate = query_characters / query_len
    # how much slower the real speech is compared to the synthesized
    # query; never shrink the query (hence the max with 1)
    stretch_factor = max(1, query_rate / audio_rate)
    self._log(["Audio rate: %.3f", audio_rate])
    self._log(["Query rate: %.3f", query_rate])
    self._log(["Stretch factor: %.3f", stretch_factor])
    audio_mfcc = self.audio_file.audio_mfcc
    self._log(["Actual audio has %d frames", audio_mfcc.shape[1]])
    # only search the first portion of the audio, padded by AUDIO_FACTOR
    audio_mfcc_end_index = int(max_start_length * self.AUDIO_FACTOR * self.frame_rate)
    self._log(["Limiting audio to first %d frames", audio_mfcc_end_index])
    audio_mfcc_end_index = min(audio_mfcc_end_index, audio_mfcc.shape[1])
    audio_mfcc = audio_mfcc[:, 0:audio_mfcc_end_index]
    self._log(["Limited audio has %d frames", audio_mfcc.shape[1]])
    # o = number of audio frames, n = number of query frames
    # (l, the number of MFCC coefficients, is the same for both)
    l, o = audio_mfcc.shape
    l, n = query_mfcc.shape
    # minimum length of a matched interval in the real audio
    stretched_match_minimum_length = int(n * stretch_factor)
    self._log(["Audio has %d frames == %.3f seconds", o, self._i2t(o)])
    self._log(["Query has %d frames == %.3f seconds", n, self._i2t(n)])
    self._log(["Stretch factor: %.3f", stretch_factor])
    self._log(["Required minimum length: %.3f", stretched_match_minimum_length])
    self._log("Speech intervals:")
    for interval in self.audio_speech:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    # only speech intervals whose start falls inside
    # [min_start_length, max_start_length] are candidates
    admissible_intervals = [x for x in self.audio_speech if ((x[0] >= min_start_length) and (x[0] <= max_start_length))]
    self._log("AdmissibleSpeech intervals:")
    for interval in admissible_intervals:
        self._log([" %d %d == %.3f %.3f", self._t2i(interval[0]), self._t2i(interval[1]), interval[0], interval[1]])
    candidates = []
    # early-stopping counters: consecutive minimum-length matches and
    # consecutive runs without improving the chosen metric
    runs_with_min_length = 0
    runs_no_improvement = 0
    runs_min_distortion = numpy.inf
    runs_min_value = numpy.inf
    for interval in admissible_intervals:
        if runs_no_improvement >= self.MAX_RUNS_NO_IMPROVEMENT:
            self._log(" Breaking: too many runs without improvement")
            break
        if runs_with_min_length >= self.MAX_RUNS_WITH_MIN_LENGTH:
            self._log(" Breaking: too many runs with minimum required length")
            break
        start_time = interval[0]
        start_index = self._t2i(start_time)
        self._log(["Evaluating interval starting at %d == %.3f ", start_index, start_time])
        if start_index > o:
            self._log(" Breaking: start index outside audio window")
            break
        req_end_index = start_index + stretched_match_minimum_length
        req_end_time = self._i2t(req_end_index)
        if req_end_index > o:
            self._log(" Breaking: not enough audio left in shifted window")
            break
        # compare against at most 2*n audio frames from this start point
        end_index = min(start_index + 2 * n, o)
        end_time = self._i2t(end_index)
        self._log([" Start %d == %.3f", start_index, start_time])
        self._log([" Req end %d == %.3f", req_end_index, req_end_time])
        self._log([" Eff end %d == %.3f", end_index, end_time])
        audio_mfcc_sub = audio_mfcc[:, start_index:end_index]
        l, m = audio_mfcc_sub.shape
        self._log("Computing DTW...")
        # DTWAligner is fed the MFCC matrices directly (no wave files)
        aligner = DTWAligner(None, None, frame_rate=self.frame_rate, logger=self.logger)
        aligner.real_wave_full_mfcc = audio_mfcc_sub
        aligner.synt_wave_full_mfcc = query_mfcc
        aligner.real_wave_length = self._i2t(m)
        aligner.synt_wave_length = self._i2t(n)
        acm = aligner.compute_accumulated_cost_matrix()
        # transpose, so we have an n x m accumulated cost matrix
        acm = acm.transpose()
        # last row holds the total alignment cost for each audio length
        last_row = acm[-1, :]
        self._log("Computing DTW... done")
        # find the minimum, but its index must be >= stretched_match_minimum_length
        candidate_argmin_index = numpy.argmin(last_row[stretched_match_minimum_length:])
        candidate_length_index = stretched_match_minimum_length + candidate_argmin_index
        candidate_length_time = self._i2t(candidate_length_index)
        candidate_value = last_row[candidate_length_index]
        candidate_end_index = start_index + candidate_length_index
        candidate_end_time = self._i2t(candidate_end_index)
        # distortion: alignment cost normalized by match length
        candidate_distortion = candidate_value / candidate_length_index
        # check if the candidate has minimum length
        if candidate_length_index == stretched_match_minimum_length:
            runs_with_min_length += 1
        else:
            runs_with_min_length = 0
        # check if the candidate improved the global minimum value
        if metric == SDMetric.VALUE:
            if candidate_value < runs_min_value:
                runs_min_value = candidate_value
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        if metric == SDMetric.DISTORTION:
            if candidate_distortion < runs_min_distortion:
                runs_min_distortion = candidate_distortion
                runs_no_improvement = 0
            else:
                runs_no_improvement += 1
        # append to the list of candidates
        self._log([" Interval start: %d == %.6f", start_index, start_time])
        self._log([" Interval end: %d == %.6f", end_index, end_time])
        self._log([" Candidate start: %d == %.6f", start_index, start_time])
        self._log([" Candidate end: %d == %.6f", candidate_end_index, candidate_end_time])
        self._log([" Candidate length: %d == %.6f", candidate_length_index, candidate_length_time])
        self._log([" Candidate value: %.6f", candidate_value])
        self._log([" Candidate distortion: %.6f", candidate_distortion])
        candidates.append({
            "start_index": start_index,
            "length": candidate_length_index,
            "value": candidate_value,
            "distortion": candidate_distortion
        })
    # select best candidate and return its start time
    # if we have no best candidate, return 0.0
    best_candidate = self._select_best_candidate(candidates, metric)
    if best_candidate is None:
        return 0.0
    sd_time = self._i2t(max(best_candidate["start_index"], 0))
    self._log(["Returning time %.3f", sd_time])
    return sd_time
def main(): """ Entry point """ if len(sys.argv) < 4: usage() return audio_file_path = sys.argv[1] tmp_handler, tmp_file_path = tempfile.mkstemp( suffix=".wav", dir=gf.custom_tmp_dir() ) mode = sys.argv[2] output_file_path = sys.argv[3] verbose = (sys.argv[-1] == "-v") if mode not in ["speech", "nonspeech", "both"]: usage() return if not gf.can_run_c_extension(): print "[WARN] Unable to load Python C Extensions" print "[WARN] Running the slower pure Python code" print "[WARN] See the README file for directions to compile the Python C Extensions" logger = Logger(tee=verbose) print "[INFO] Converting audio file to mono..." converter = FFMPEGWrapper(logger=logger) converter.convert(audio_file_path, tmp_file_path) print "[INFO] Converting audio file to mono... done" vad = VAD(tmp_file_path, logger=logger) print "[INFO] Extracting MFCCs..." vad.compute_mfcc() print "[INFO] Extracting MFCCs... done" print "[INFO] Executing VAD..." vad.compute_vad() print "[INFO] Executing VAD... done" print "[INFO] Cleaning up..." cleanup(tmp_handler, tmp_file_path) print "[INFO] Cleaning up... done" if mode == "speech": print "[INFO] Creating speech file..." output_file = open(output_file_path, "w") for interval in vad.speech: output_file.write("%.3f\t%.3f\n" % (interval[0], interval[1])) output_file.close() print "[INFO] Creating speech file... done" if mode == "nonspeech": print "[INFO] Creating nonspeech file..." output_file = open(output_file_path, "w") for interval in vad.nonspeech: output_file.write("%.3f\t%.3f\n" % (interval[0], interval[1])) output_file.close() print "[INFO] Creating nonspeech file... done" if mode == "both": print "[INFO] Creating speech and nonspeech file..." 
output_file = open(output_file_path, "w") speech = [[x[0], x[1], "speech"] for x in vad.speech] nonspeech = [[x[0], x[1], "nonspeech"] for x in vad.nonspeech] both = sorted(speech + nonspeech) for interval in both: output_file.write("%.3f\t%.3f\t%s\n" % ( interval[0], interval[1], interval[2] )) output_file.close() print "[INFO] Creating speech and nonspeech file... done" print "[INFO] Created file %s" % output_file_path