def run_vad(self):
    """
    Run the voice activity detector and store its results internally.

    ``self.__mfcc[0]`` is handed to a ``VAD`` object; the mask it returns
    is stored, together with the indices where the mask is true and the
    ``(first_index, last_index)`` pair of every contiguous run of
    speech (mask true) and nonspeech (mask false) indices.
    """
    def _split_into_runs(indices):
        """
        Group the given indices into maximal runs of consecutive values.

        :param indices: the indices to be grouped
        :type  indices: :class:`numpy.ndarray` (1D)
        :rtype: list of :class:`numpy.ndarray` (1D)
        """
        if len(indices) < 1:
            return []
        # a new run starts right after every step that is not exactly +1
        cut_points = numpy.where(numpy.diff(indices) != 1)[0] + 1
        return numpy.split(indices, cut_points)

    def _run_endpoints(indices):
        """ Return the (first, last) index pair of each run. """
        return [(run[0], run[-1]) for run in _split_into_runs(indices)]

    self.log(u"Creating VAD object")
    detector = VAD(rconf=self.rconf, logger=self.logger)

    self.log(u"Running VAD...")
    mask = detector.run_vad(self.__mfcc[0])
    self.__mfcc_mask = mask
    true_indices = numpy.where(mask)[0]
    self.__mfcc_mask_map = true_indices
    self.log(u"Running VAD... done")

    self.log(u"Storing speech and nonspeech intervals...")
    # the indices where the mask is true are already available: reuse them
    self.__speech_intervals = _run_endpoints(true_indices)
    # the indices where the mask is false must be computed now
    false_indices = numpy.where(~mask)[0]
    self.__nonspeech_intervals = _run_endpoints(false_indices)
    self.log(u"Storing speech and nonspeech intervals... done")
def run_vad(self):
    """
    Detect speech and nonspeech and keep the outcome inside this object.

    A ``VAD`` object is run on ``self.__mfcc[0]``. The resulting mask,
    the indices where it is true, and the ``(first_index, last_index)``
    endpoints of each contiguous speech / nonspeech run are all stored
    in private attributes.
    """
    def _consecutive_groups(positions):
        """
        Partition the positions into arrays of consecutive values.

        :param positions: the positions to be partitioned
        :type  positions: :class:`numpy.ndarray` (1D)
        :rtype: list of :class:`numpy.ndarray` (1D)
        """
        if len(positions) < 1:
            return []
        # split wherever two neighbouring positions are not adjacent
        gaps = numpy.where(numpy.diff(positions) != 1)[0]
        return numpy.split(positions, gaps + 1)

    def _bounds(positions):
        """ Return the (first, last) position of each group. """
        return [(g[0], g[-1]) for g in _consecutive_groups(positions)]

    self.log(u"Creating VAD object")
    vad_object = VAD(rconf=self.rconf, logger=self.logger)

    self.log(u"Running VAD...")
    speech_mask = vad_object.run_vad(self.__mfcc[0])
    self.__mfcc_mask = speech_mask
    speech_positions = numpy.where(speech_mask)[0]
    self.__mfcc_mask_map = speech_positions
    self.log(u"Running VAD... done")

    self.log(u"Storing speech and nonspeech intervals...")
    # positions with a true mask were computed above and are reused here
    self.__speech_intervals = _bounds(speech_positions)
    # positions with a false mask are computed on the spot
    nonspeech_positions = numpy.where(~speech_mask)[0]
    self.__nonspeech_intervals = _bounds(nonspeech_positions)
    self.log(u"Storing speech and nonspeech intervals... done")
def _extract_speech(self):
    """
    Run the VAD on the audio file and keep the detected speech.

    A ``VAD`` object is created with the frame rate and the logger of
    this object, it is given the length and the MFCCs of
    ``self.audio_file``, and, after ``compute_vad()`` has run, its
    ``speech`` attribute is stored in ``self.audio_speech``.
    """
    self._log("Running VAD...")
    detector = VAD(frame_rate=self.frame_rate, logger=self.logger)
    # feed the detector with the already-computed audio data
    detector.wave_len = self.audio_file.audio_length
    detector.wave_mfcc = self.audio_file.audio_mfcc
    detector.compute_vad()
    self.audio_speech = detector.speech
    self._log("Running VAD... done")
def run_vad(
        self,
        log_energy_threshold=None,
        min_nonspeech_length=None,
        extend_before=None,
        extend_after=None
):
    """
    Run the voice activity detector and store its results internally.

    ``self.__mfcc[0]`` is passed to a ``VAD`` object as the wave energy;
    the mask it returns is stored, together with the indices where the
    mask is true and the ``(first_index, last_index)`` pair of every
    contiguous run of speech (mask true) and nonspeech (mask false)
    indices.

    Each of the four parameters may be ``None``, in which case
    the corresponding RuntimeConfiguration value is applied.

    :param float log_energy_threshold: the minimum log energy threshold
                                       to consider a frame as speech
    :param int min_nonspeech_length: the minimum length, in frames,
                                     of a nonspeech interval
    :param int extend_before: extend each speech interval by this number
                              of frames to the left (before)
    :param int extend_after: extend each speech interval by this number
                             of frames to the right (after)
    """
    def _split_into_runs(indices):
        """
        Group the given indices into maximal runs of consecutive values.

        :param indices: the indices to be grouped
        :type  indices: :class:`numpy.ndarray` (1D)
        :rtype: list of :class:`numpy.ndarray` (1D)
        """
        if len(indices) < 1:
            return []
        # a new run starts right after every step that is not exactly +1
        cut_points = numpy.where(numpy.diff(indices) != 1)[0] + 1
        return numpy.split(indices, cut_points)

    def _run_endpoints(indices):
        """ Return the (first, last) index pair of each run. """
        return [(run[0], run[-1]) for run in _split_into_runs(indices)]

    self.log(u"Creating VAD object")
    detector = VAD(rconf=self.rconf, logger=self.logger)

    self.log(u"Running VAD...")
    mask = detector.run_vad(
        wave_energy=self.__mfcc[0],
        log_energy_threshold=log_energy_threshold,
        min_nonspeech_length=min_nonspeech_length,
        extend_before=extend_before,
        extend_after=extend_after
    )
    self.__mfcc_mask = mask
    true_indices = numpy.where(mask)[0]
    self.__mfcc_mask_map = true_indices
    self.log(u"Running VAD... done")

    self.log(u"Storing speech and nonspeech intervals...")
    # the indices where the mask is true are already available: reuse them
    self.__speech_intervals = _run_endpoints(true_indices)
    # the indices where the mask is false must be computed now
    false_indices = numpy.where(~mask)[0]
    self.__nonspeech_intervals = _run_endpoints(false_indices)
    self.log(u"Storing speech and nonspeech intervals... done")
def run_vad(self, log_energy_threshold=None, min_nonspeech_length=None, extend_before=None, extend_after=None):
    """
    Detect speech and nonspeech and keep the outcome inside this object.

    A ``VAD`` object is run with ``self.__mfcc[0]`` as the wave energy.
    The resulting mask, the indices where it is true, and the
    ``(first_index, last_index)`` endpoints of each contiguous
    speech / nonspeech run are all stored in private attributes.

    Any of the four parameters can be left to ``None``:
    the corresponding RuntimeConfiguration value is then applied.

    :param float log_energy_threshold: the minimum log energy threshold
                                       to consider a frame as speech
    :param int min_nonspeech_length: the minimum length, in frames,
                                     of a nonspeech interval
    :param int extend_before: extend each speech interval by this number
                              of frames to the left (before)
    :param int extend_after: extend each speech interval by this number
                             of frames to the right (after)
    """
    def _consecutive_groups(positions):
        """
        Partition the positions into arrays of consecutive values.

        :param positions: the positions to be partitioned
        :type  positions: :class:`numpy.ndarray` (1D)
        :rtype: list of :class:`numpy.ndarray` (1D)
        """
        if len(positions) < 1:
            return []
        # split wherever two neighbouring positions are not adjacent
        gaps = numpy.where(numpy.diff(positions) != 1)[0]
        return numpy.split(positions, gaps + 1)

    def _bounds(positions):
        """ Return the (first, last) position of each group. """
        return [(g[0], g[-1]) for g in _consecutive_groups(positions)]

    self.log(u"Creating VAD object")
    vad_object = VAD(rconf=self.rconf, logger=self.logger)

    self.log(u"Running VAD...")
    speech_mask = vad_object.run_vad(
        wave_energy=self.__mfcc[0],
        log_energy_threshold=log_energy_threshold,
        min_nonspeech_length=min_nonspeech_length,
        extend_before=extend_before,
        extend_after=extend_after)
    self.__mfcc_mask = speech_mask
    speech_positions = numpy.where(speech_mask)[0]
    self.__mfcc_mask_map = speech_positions
    self.log(u"Running VAD... done")

    self.log(u"Storing speech and nonspeech intervals...")
    # positions with a true mask were computed above and are reused here
    self.__speech_intervals = _bounds(speech_positions)
    # positions with a false mask are computed on the spot
    nonspeech_positions = numpy.where(~speech_mask)[0]
    self.__nonspeech_intervals = _bounds(nonspeech_positions)
    self.log(u"Storing speech and nonspeech intervals... done")
def _adjust_boundaries(self, text_map, real_wave_full_mfcc, real_wave_length):
    """
    Adjust the boundaries between consecutive fragments,
    according to the algorithm set in the task configuration.

    When no algorithm (or the AUTO algorithm) is requested,
    the given ``text_map`` is returned unchanged.
    If running the VAD raises an exception, the failure is logged
    and ``(False, None)`` is returned.

    Return a pair:

    1. a success bool flag
    2. the computed interval map, that is,
       a list of triples ``[start_time, end_time, fragment_id]``
    """
    self._log("Adjusting boundaries")
    algo = self.task.configuration.adjust_boundary_algorithm

    # nothing to adjust in these two cases: hand the map back untouched
    if algo is None:
        self._log("No adjust boundary algorithm specified: returning")
        return (True, text_map)
    if algo == AdjustBoundaryAlgorithm.AUTO:
        self._log("Requested adjust boundary algorithm AUTO: returning")
        return (True, text_map)

    # (algorithm constant name, configuration attribute holding its value);
    # both names are resolved lazily, in this order, while looking for
    # the first algorithm equal to the requested one
    value_sources = (
        ("AFTERCURRENT", "adjust_boundary_aftercurrent_value"),
        ("BEFORENEXT", "adjust_boundary_beforenext_value"),
        ("OFFSET", "adjust_boundary_offset_value"),
        ("PERCENT", "adjust_boundary_percent_value"),
        ("RATE", "adjust_boundary_rate_value"),
        ("RATEAGGRESSIVE", "adjust_boundary_rate_value"),
    )
    value = None
    for constant_name, attribute_name in value_sources:
        if algo == getattr(AdjustBoundaryAlgorithm, constant_name):
            value = getattr(self.task.configuration, attribute_name)
            break
    self._log(["Requested algo %s and value %s", algo, value])

    try:
        self._log("Running VAD...")
        detector = VAD(logger=self.logger)
        detector.wave_mfcc = real_wave_full_mfcc
        detector.wave_len = real_wave_length
        detector.compute_vad()
        self._log("Running VAD... done")
    except Exception as exc:
        # any failure while running the VAD is reported to the caller
        # through the success flag, not by propagating the exception
        self._log("Adjusting boundaries: failed")
        self._log(["Message: %s", str(exc)])
        return (False, None)

    self._log("Creating AdjustBoundaryAlgorithm object")
    adjuster = AdjustBoundaryAlgorithm(
        algorithm=algo,
        text_map=text_map,
        speech=detector.speech,
        nonspeech=detector.nonspeech,
        value=value,
        logger=self.logger
    )
    self._log("Adjusting boundaries...")
    adjusted_map = adjuster.adjust()
    self._log("Adjusting boundaries... done")
    self._log("Adjusting boundaries: succeeded")
    return (True, adjusted_map)
def perform(self, input_file_path, speech_length, nonspeech_length):
    """
    Compute MFCCs and VAD for the given file, then check how many
    items end up in the ``speech`` and ``nonspeech`` attributes.

    :param input_file_path: the path of the input file,
                            resolved through ``get_abs_path``
    :param speech_length: the expected length of ``speech``
    :param nonspeech_length: the expected length of ``nonspeech``
    """
    detector = VAD(get_abs_path(input_file_path))
    detector.compute_mfcc()
    detector.compute_vad()
    found_speech = len(detector.speech)
    self.assertEqual(found_speech, speech_length)
    found_nonspeech = len(detector.nonspeech)
    self.assertEqual(found_nonspeech, nonspeech_length)
def _adjust_boundaries(
        self,
        text_map,
        real_wave_full_mfcc,
        real_wave_length
):
    """
    Move the boundaries between consecutive fragments,
    using the algorithm requested by the task configuration.

    If the configuration requests no algorithm, or the AUTO one,
    ``text_map`` is returned as it is.
    If the VAD step raises an exception, the failure is logged
    and the pair ``(False, None)`` is returned.

    Return a pair:

    1. a success bool flag
    2. the computed interval map, that is,
       a list of triples ``[start_time, end_time, fragment_id]``
    """
    self._log("Adjusting boundaries")
    configuration = self.task.configuration
    algo = configuration.adjust_boundary_algorithm

    # early exits: no adjustment is performed in these two cases
    if algo is None:
        self._log("No adjust boundary algorithm specified: returning")
        return (True, text_map)
    if algo == AdjustBoundaryAlgorithm.AUTO:
        self._log("Requested adjust boundary algorithm AUTO: returning")
        return (True, text_map)

    # pairs (algorithm constant name, configuration attribute name),
    # scanned in order and resolved only when needed;
    # the first algorithm equal to the requested one provides the value
    parameter_table = [
        ("AFTERCURRENT", "adjust_boundary_aftercurrent_value"),
        ("BEFORENEXT", "adjust_boundary_beforenext_value"),
        ("OFFSET", "adjust_boundary_offset_value"),
        ("PERCENT", "adjust_boundary_percent_value"),
        ("RATE", "adjust_boundary_rate_value"),
        ("RATEAGGRESSIVE", "adjust_boundary_rate_value"),
    ]
    value = None
    for algorithm_name, value_name in parameter_table:
        if algo == getattr(AdjustBoundaryAlgorithm, algorithm_name):
            value = getattr(self.task.configuration, value_name)
            break
    self._log(["Requested algo %s and value %s", algo, value])

    try:
        self._log("Running VAD...")
        vad_object = VAD(logger=self.logger)
        vad_object.wave_mfcc = real_wave_full_mfcc
        vad_object.wave_len = real_wave_length
        vad_object.compute_vad()
        self._log("Running VAD... done")
    except Exception as error:
        # a failing VAD is signalled through the returned flag
        self._log("Adjusting boundaries: failed")
        self._log(["Message: %s", str(error)])
        return (False, None)

    self._log("Creating AdjustBoundaryAlgorithm object")
    boundary_adjuster = AdjustBoundaryAlgorithm(
        algorithm=algo,
        text_map=text_map,
        speech=vad_object.speech,
        nonspeech=vad_object.nonspeech,
        value=value,
        logger=self.logger
    )
    self._log("Adjusting boundaries...")
    result_map = boundary_adjuster.adjust()
    self._log("Adjusting boundaries... done")
    self._log("Adjusting boundaries: succeeded")
    return (True, result_map)
def main():
    """
    Entry point.

    Expected command line: ``AUDIO_FILE MODE OUTPUT_FILE [-v]``,
    where ``MODE`` is one of ``speech``, ``nonspeech``, ``both``.

    The audio file is converted into a temporary ``.wav`` file,
    MFCCs are extracted from it and the VAD is run; then the requested
    intervals are written to the output file, one per line, as
    tab-separated ``start``, ``end`` values with three decimals
    (plus a ``speech``/``nonspeech`` label in ``both`` mode).

    If the arguments are missing or the mode is not valid,
    the usage message is shown and nothing else is done.
    The temporary file is always cleaned up, even if the conversion,
    the MFCC extraction or the VAD raises an exception.
    """
    # NOTE: print(...) with a single argument behaves the same way
    #       both as a Python 2 statement and as a Python 3 function call
    if len(sys.argv) < 4:
        usage()
        return
    audio_file_path = sys.argv[1]
    mode = sys.argv[2]
    output_file_path = sys.argv[3]
    verbose = (sys.argv[-1] == "-v")

    # validate the mode BEFORE creating the temporary file,
    # so that bailing out here does not leave a stray file behind
    if mode not in ["speech", "nonspeech", "both"]:
        usage()
        return

    if not gf.can_run_c_extension():
        print("[WARN] Unable to load Python C Extensions")
        print("[WARN] Running the slower pure Python code")
        print("[WARN] See the README file for directions to compile the Python C Extensions")

    logger = Logger(tee=verbose)

    tmp_handler, tmp_file_path = tempfile.mkstemp(
        suffix=".wav",
        dir=gf.custom_tmp_dir()
    )
    try:
        print("[INFO] Converting audio file to mono...")
        converter = FFMPEGWrapper(logger=logger)
        converter.convert(audio_file_path, tmp_file_path)
        print("[INFO] Converting audio file to mono... done")

        vad = VAD(tmp_file_path, logger=logger)

        print("[INFO] Extracting MFCCs...")
        vad.compute_mfcc()
        print("[INFO] Extracting MFCCs... done")

        print("[INFO] Executing VAD...")
        vad.compute_vad()
        print("[INFO] Executing VAD... done")
    finally:
        # release the temporary file whatever happened above
        print("[INFO] Cleaning up...")
        cleanup(tmp_handler, tmp_file_path)
        print("[INFO] Cleaning up... done")

    # build the output lines for the requested mode
    if mode == "both":
        description = "speech and nonspeech"
        speech = [[x[0], x[1], "speech"] for x in vad.speech]
        nonspeech = [[x[0], x[1], "nonspeech"] for x in vad.nonspeech]
        # labelled intervals are merged and sorted
        # (by start, then end, then label)
        lines = [
            "%.3f\t%.3f\t%s\n" % (interval[0], interval[1], interval[2])
            for interval in sorted(speech + nonspeech)
        ]
    else:
        description = mode
        intervals = vad.speech if mode == "speech" else vad.nonspeech
        lines = [
            "%.3f\t%.3f\n" % (interval[0], interval[1])
            for interval in intervals
        ]

    print("[INFO] Creating %s file..." % description)
    # the context manager closes the file even if a write fails
    with open(output_file_path, "w") as output_file:
        output_file.writelines(lines)
    print("[INFO] Creating %s file... done" % description)
    print("[INFO] Created file %s" % output_file_path)