Example #1
0
    def run_vad(self):
        """
        Run the voice activity detector and store its results internally.

        The boolean mask returned by the detector is stored,
        together with the indices of its ``True`` frames
        and the lists of speech and nonspeech intervals derived from it.
        """
        def _contiguous_intervals(indices):
            """
            Collapse an array of frame indices into a list of intervals.

            The indices are grouped into runs of consecutive values;
            each run is reported as the pair ``(first, last)``
            of its first and last index.

            :param indices: the frame indices
            :type  indices: :class:`numpy.ndarray` (1D)
            :rtype: list of pairs
            """
            if len(indices) == 0:
                return []
            # a new run begins right after every position
            # where two neighbouring indices do not differ by exactly 1
            run_starts = numpy.where(numpy.diff(indices) != 1)[0] + 1
            return [
                (run[0], run[-1])
                for run in numpy.split(indices, run_starts)
            ]

        self.log(u"Creating VAD object")
        detector = VAD(rconf=self.rconf, logger=self.logger)
        self.log(u"Running VAD...")
        self.__mfcc_mask = detector.run_vad(self.__mfcc[0])
        self.__mfcc_mask_map = numpy.where(self.__mfcc_mask)[0]
        self.log(u"Running VAD... done")
        self.log(u"Storing speech and nonspeech intervals...")
        # the indices of the True (speech) frames are already available
        self.__speech_intervals = _contiguous_intervals(self.__mfcc_mask_map)
        # the indices of the False (nonspeech) frames are computed here
        nonspeech_frames = numpy.where(~self.__mfcc_mask)[0]
        self.__nonspeech_intervals = _contiguous_intervals(nonspeech_frames)
        self.log(u"Storing speech and nonspeech intervals... done")
Example #2
0
    def run_vad(self):
        """
        Classify the frames as speech or nonspeech with a VAD,
        and keep the outcome in the internal state of this object.

        Stored results: the boolean mask returned by the detector,
        the indices of the ``True`` frames of that mask,
        and the speech and nonspeech intervals.
        """
        def _intervals_of(frame_indices):
            """
            Turn an array of frame indices into ``(first, last)`` pairs,
            one pair for each run of consecutive indices.

            :param frame_indices: the frame indices
            :type  frame_indices: :class:`numpy.ndarray` (1D)
            :rtype: list of pairs
            """
            if len(frame_indices) == 0:
                return []
            # positions where the step to the previous index is not 1
            # mark the first element of a new run
            steps = numpy.diff(frame_indices)
            cut_points = numpy.where(steps != 1)[0] + 1
            pieces = numpy.split(frame_indices, cut_points)
            return [(piece[0], piece[-1]) for piece in pieces]

        self.log(u"Creating VAD object")
        detector = VAD(rconf=self.rconf, logger=self.logger)
        self.log(u"Running VAD...")
        self.__mfcc_mask = detector.run_vad(self.__mfcc[0])
        self.__mfcc_mask_map = numpy.where(self.__mfcc_mask)[0]
        self.log(u"Running VAD... done")
        self.log(u"Storing speech and nonspeech intervals...")
        # speech: reuse the indices of the True frames computed above
        self.__speech_intervals = _intervals_of(self.__mfcc_mask_map)
        # nonspeech: the indices of the False frames are computed now
        false_frames = numpy.where(~self.__mfcc_mask)[0]
        self.__nonspeech_intervals = _intervals_of(false_frames)
        self.log(u"Storing speech and nonspeech intervals... done")
Example #3
0
 def _extract_speech(self):
     """
     Run a VAD on the audio file data and store its speech intervals.

     The detector is fed the length and the MFCCs of ``self.audio_file``;
     its ``speech`` attribute is then saved as ``self.audio_speech``.
     """
     self._log("Running VAD...")
     detector = VAD(frame_rate=self.frame_rate, logger=self.logger)
     # the detector takes its input through attributes, not arguments
     detector.wave_len = self.audio_file.audio_length
     detector.wave_mfcc = self.audio_file.audio_mfcc
     detector.compute_vad()
     self.audio_speech = detector.speech
     self._log("Running VAD... done")
Example #4
0
    def run_vad(
        self,
        log_energy_threshold=None,
        min_nonspeech_length=None,
        extend_before=None,
        extend_after=None
    ):
        """
        Run the voice activity detector and store its results internally.

        The boolean mask returned by the detector is stored,
        together with the indices of its ``True`` frames
        and the lists of speech and nonspeech intervals derived from it.

        Each of the four parameters can be left to ``None``:
        in that case the corresponding RuntimeConfiguration value
        is applied.

        :param float log_energy_threshold: the minimum log energy threshold to consider a frame as speech
        :param int min_nonspeech_length: the minimum length, in frames, of a nonspeech interval
        :param int extend_before: extend each speech interval by this number of frames to the left (before)
        :param int extend_after: extend each speech interval by this number of frames to the right (after)
        """
        def _contiguous_intervals(indices):
            """
            Collapse an array of frame indices into a list of intervals.

            The indices are grouped into runs of consecutive values;
            each run is reported as the pair ``(first, last)``
            of its first and last index.

            :param indices: the frame indices
            :type  indices: :class:`numpy.ndarray` (1D)
            :rtype: list of pairs
            """
            if len(indices) == 0:
                return []
            # a new run begins right after every position
            # where two neighbouring indices do not differ by exactly 1
            run_starts = numpy.where(numpy.diff(indices) != 1)[0] + 1
            return [
                (run[0], run[-1])
                for run in numpy.split(indices, run_starts)
            ]

        self.log(u"Creating VAD object")
        detector = VAD(rconf=self.rconf, logger=self.logger)
        self.log(u"Running VAD...")
        # the first row of the stored MFCCs is passed as the wave energy
        self.__mfcc_mask = detector.run_vad(
            wave_energy=self.__mfcc[0],
            log_energy_threshold=log_energy_threshold,
            min_nonspeech_length=min_nonspeech_length,
            extend_before=extend_before,
            extend_after=extend_after
        )
        self.__mfcc_mask_map = numpy.where(self.__mfcc_mask)[0]
        self.log(u"Running VAD... done")
        self.log(u"Storing speech and nonspeech intervals...")
        # the indices of the True (speech) frames are already available
        self.__speech_intervals = _contiguous_intervals(self.__mfcc_mask_map)
        # the indices of the False (nonspeech) frames are computed here
        nonspeech_frames = numpy.where(~self.__mfcc_mask)[0]
        self.__nonspeech_intervals = _contiguous_intervals(nonspeech_frames)
        self.log(u"Storing speech and nonspeech intervals... done")
Example #5
0
    def run_vad(self,
                log_energy_threshold=None,
                min_nonspeech_length=None,
                extend_before=None,
                extend_after=None):
        """
        Classify the frames as speech or nonspeech with a VAD,
        and keep the outcome in the internal state of this object.

        Stored results: the boolean mask returned by the detector,
        the indices of the ``True`` frames of that mask,
        and the speech and nonspeech intervals.

        Any of the four parameters may be ``None``:
        the corresponding RuntimeConfiguration value is then applied.

        :param float log_energy_threshold: the minimum log energy threshold to consider a frame as speech
        :param int min_nonspeech_length: the minimum length, in frames, of a nonspeech interval
        :param int extend_before: extend each speech interval by this number of frames to the left (before)
        :param int extend_after: extend each speech interval by this number of frames to the right (after)
        """
        def _intervals_of(frame_indices):
            """
            Turn an array of frame indices into ``(first, last)`` pairs,
            one pair for each run of consecutive indices.

            :param frame_indices: the frame indices
            :type  frame_indices: :class:`numpy.ndarray` (1D)
            :rtype: list of pairs
            """
            if len(frame_indices) == 0:
                return []
            # positions where the step to the previous index is not 1
            # mark the first element of a new run
            steps = numpy.diff(frame_indices)
            cut_points = numpy.where(steps != 1)[0] + 1
            pieces = numpy.split(frame_indices, cut_points)
            return [(piece[0], piece[-1]) for piece in pieces]

        self.log(u"Creating VAD object")
        detector = VAD(rconf=self.rconf, logger=self.logger)
        self.log(u"Running VAD...")
        # the first row of the stored MFCCs is passed as the wave energy
        self.__mfcc_mask = detector.run_vad(
            wave_energy=self.__mfcc[0],
            log_energy_threshold=log_energy_threshold,
            min_nonspeech_length=min_nonspeech_length,
            extend_before=extend_before,
            extend_after=extend_after)
        self.__mfcc_mask_map = numpy.where(self.__mfcc_mask)[0]
        self.log(u"Running VAD... done")
        self.log(u"Storing speech and nonspeech intervals...")
        # speech: reuse the indices of the True frames computed above
        self.__speech_intervals = _intervals_of(self.__mfcc_mask_map)
        # nonspeech: the indices of the False frames are computed now
        false_frames = numpy.where(~self.__mfcc_mask)[0]
        self.__nonspeech_intervals = _intervals_of(false_frames)
        self.log(u"Storing speech and nonspeech intervals... done")
Example #6
0
    def _adjust_boundaries(self, text_map, real_wave_full_mfcc,
                           real_wave_length):
        """
        Adjust the boundaries between consecutive fragments,
        using the algorithm requested by the task configuration.

        When the configured algorithm is ``None`` or ``AUTO``,
        nothing is adjusted and ``text_map`` is returned unchanged.
        Otherwise a VAD is run on the given MFCCs, and the speech and
        nonspeech intervals it finds are passed, along with ``text_map``,
        to an ``AdjustBoundaryAlgorithm`` object that computes the result.

        Return a pair:

        1. a success bool flag (``False`` when running the VAD raised)
        2. the computed interval map, that is,
           a list of triples ``[start_time, end_time, fragment_id]``
           (``None`` on failure)

        """
        self._log("Adjusting boundaries")
        algo = self.task.configuration.adjust_boundary_algorithm

        # nothing to do for these two cases: hand the map back untouched
        if algo is None:
            self._log("No adjust boundary algorithm specified: returning")
            return (True, text_map)
        if algo == AdjustBoundaryAlgorithm.AUTO:
            self._log("Requested adjust boundary algorithm AUTO: returning")
            return (True, text_map)

        # every other algorithm reads its parameter from a configuration
        # attribute; RATE and RATEAGGRESSIVE share the same one
        value_attributes = (
            (AdjustBoundaryAlgorithm.AFTERCURRENT,
             "adjust_boundary_aftercurrent_value"),
            (AdjustBoundaryAlgorithm.BEFORENEXT,
             "adjust_boundary_beforenext_value"),
            (AdjustBoundaryAlgorithm.OFFSET,
             "adjust_boundary_offset_value"),
            (AdjustBoundaryAlgorithm.PERCENT,
             "adjust_boundary_percent_value"),
            (AdjustBoundaryAlgorithm.RATE,
             "adjust_boundary_rate_value"),
            (AdjustBoundaryAlgorithm.RATEAGGRESSIVE,
             "adjust_boundary_rate_value"),
        )
        # stays None when the algorithm matches none of the entries above
        value = None
        for candidate, attribute in value_attributes:
            if algo == candidate:
                value = getattr(self.task.configuration, attribute)
                break
        self._log(["Requested algo %s and value %s", algo, value])

        try:
            self._log("Running VAD...")
            detector = VAD(logger=self.logger)
            detector.wave_mfcc = real_wave_full_mfcc
            detector.wave_len = real_wave_length
            detector.compute_vad()
            self._log("Running VAD... done")
        except Exception as exc:
            # any VAD failure is logged and reported through the flag
            self._log("Adjusting boundaries: failed")
            self._log(["Message: %s", str(exc)])
            return (False, None)

        self._log("Creating AdjustBoundaryAlgorithm object")
        adjuster = AdjustBoundaryAlgorithm(
            algorithm=algo,
            text_map=text_map,
            speech=detector.speech,
            nonspeech=detector.nonspeech,
            value=value,
            logger=self.logger
        )
        self._log("Adjusting boundaries...")
        result_map = adjuster.adjust()
        self._log("Adjusting boundaries... done")
        self._log("Adjusting boundaries: succeeded")
        return (True, result_map)
Example #7
0
 def _extract_speech(self):
     """
     Compute the speech intervals of the audio file with a VAD.

     The length and the MFCCs of ``self.audio_file`` are given
     to the detector; after it has run, its ``speech`` attribute
     is stored in ``self.audio_speech``.
     """
     self._log("Running VAD...")
     speech_detector = VAD(frame_rate=self.frame_rate, logger=self.logger)
     # input is supplied by setting attributes on the detector
     speech_detector.wave_len = self.audio_file.audio_length
     speech_detector.wave_mfcc = self.audio_file.audio_mfcc
     speech_detector.compute_vad()
     self.audio_speech = speech_detector.speech
     self._log("Running VAD... done")
Example #8
0
 def perform(self, input_file_path, speech_length, nonspeech_length):
     """
     Run MFCC extraction and VAD on a file, then check
     how many speech and nonspeech intervals were found.

     :param input_file_path: the audio file path, resolved with ``get_abs_path``
     :param speech_length: the expected number of speech intervals
     :param nonspeech_length: the expected number of nonspeech intervals
     """
     absolute_path = get_abs_path(input_file_path)
     detector = VAD(absolute_path)
     detector.compute_mfcc()
     detector.compute_vad()
     self.assertEqual(len(detector.speech), speech_length)
     self.assertEqual(len(detector.nonspeech), nonspeech_length)
Example #9
0
    def _adjust_boundaries(
            self,
            text_map,
            real_wave_full_mfcc,
            real_wave_length
        ):
        """
        Adjust the boundaries between consecutive fragments,
        according to the algorithm set in the task configuration.

        If the configured algorithm is ``None`` or ``AUTO``,
        ``text_map`` is returned as it is, without running anything.
        In every other case a VAD is run on the given MFCCs;
        its speech and nonspeech intervals, together with ``text_map``,
        are given to an ``AdjustBoundaryAlgorithm`` object,
        which produces the adjusted map.

        Return a pair:

        1. a success bool flag (``False`` when running the VAD raised)
        2. the computed interval map, that is,
           a list of triples ``[start_time, end_time, fragment_id]``
           (``None`` on failure)

        """
        self._log("Adjusting boundaries")
        algo = self.task.configuration.adjust_boundary_algorithm

        # early exits: no algorithm, or AUTO, leave the map untouched
        if algo is None:
            self._log("No adjust boundary algorithm specified: returning")
            return (True, text_map)
        if algo == AdjustBoundaryAlgorithm.AUTO:
            self._log("Requested adjust boundary algorithm AUTO: returning")
            return (True, text_map)

        # algorithm -> name of the configuration attribute holding
        # its parameter (RATE and RATEAGGRESSIVE use the same attribute)
        parameter_sources = (
            (
                AdjustBoundaryAlgorithm.AFTERCURRENT,
                "adjust_boundary_aftercurrent_value"
            ),
            (
                AdjustBoundaryAlgorithm.BEFORENEXT,
                "adjust_boundary_beforenext_value"
            ),
            (
                AdjustBoundaryAlgorithm.OFFSET,
                "adjust_boundary_offset_value"
            ),
            (
                AdjustBoundaryAlgorithm.PERCENT,
                "adjust_boundary_percent_value"
            ),
            (
                AdjustBoundaryAlgorithm.RATE,
                "adjust_boundary_rate_value"
            ),
            (
                AdjustBoundaryAlgorithm.RATEAGGRESSIVE,
                "adjust_boundary_rate_value"
            ),
        )
        # remains None if the algorithm is not one of those listed above
        value = None
        for known_algo, attribute_name in parameter_sources:
            if algo == known_algo:
                value = getattr(self.task.configuration, attribute_name)
                break
        self._log(["Requested algo %s and value %s", algo, value])

        try:
            self._log("Running VAD...")
            detector = VAD(logger=self.logger)
            detector.wave_mfcc = real_wave_full_mfcc
            detector.wave_len = real_wave_length
            detector.compute_vad()
            self._log("Running VAD... done")
        except Exception as exc:
            # a failing VAD is logged and signalled with the False flag
            self._log("Adjusting boundaries: failed")
            self._log(["Message: %s", str(exc)])
            return (False, None)

        self._log("Creating AdjustBoundaryAlgorithm object")
        adjuster = AdjustBoundaryAlgorithm(
            algorithm=algo,
            text_map=text_map,
            speech=detector.speech,
            nonspeech=detector.nonspeech,
            value=value,
            logger=self.logger
        )
        self._log("Adjusting boundaries...")
        result_map = adjuster.adjust()
        self._log("Adjusting boundaries... done")
        self._log("Adjusting boundaries: succeeded")
        return (True, result_map)
Example #10
0
 def perform(self, input_file_path, speech_length, nonspeech_length):
     """
     Compute MFCCs and VAD for the given file and assert
     the number of speech and nonspeech intervals detected.

     :param input_file_path: the audio file path, resolved with ``get_abs_path``
     :param speech_length: the expected number of speech intervals
     :param nonspeech_length: the expected number of nonspeech intervals
     """
     audio_path = get_abs_path(input_file_path)
     speech_detector = VAD(audio_path)
     speech_detector.compute_mfcc()
     speech_detector.compute_vad()
     self.assertEqual(len(speech_detector.speech), speech_length)
     self.assertEqual(len(speech_detector.nonspeech), nonspeech_length)
Example #11
0
def main():
    """
    Entry point.

    Read from the command line the input audio file path, the mode
    (``speech``, ``nonspeech`` or ``both``) and the output file path.
    If the last argument is ``-v``, the logger is created with ``tee=True``.

    The audio file is converted into a temporary ``.wav`` file,
    MFCCs and VAD are computed on it, and the detected intervals
    are written to the output file, one per line, as tab-separated
    values with three decimal digits. In ``both`` mode the intervals
    are sorted and a third column holds ``speech`` or ``nonspeech``.

    The usage message is shown, and nothing else is done,
    when fewer than three arguments are given or the mode is not valid.
    """
    if len(sys.argv) < 4:
        usage()
        return
    audio_file_path = sys.argv[1]
    mode = sys.argv[2]
    output_file_path = sys.argv[3]
    verbose = (sys.argv[-1] == "-v")

    # validate the mode BEFORE creating the temporary file,
    # otherwise an invalid mode would return leaving the file behind
    if mode not in ["speech", "nonspeech", "both"]:
        usage()
        return

    if not gf.can_run_c_extension():
        print("[WARN] Unable to load Python C Extensions")
        print("[WARN] Running the slower pure Python code")
        print("[WARN] See the README file for directions to compile the Python C Extensions")

    logger = Logger(tee=verbose)

    tmp_handler, tmp_file_path = tempfile.mkstemp(
        suffix=".wav",
        dir=gf.custom_tmp_dir()
    )
    try:
        print("[INFO] Converting audio file to mono...")
        converter = FFMPEGWrapper(logger=logger)
        converter.convert(audio_file_path, tmp_file_path)
        print("[INFO] Converting audio file to mono... done")

        vad = VAD(tmp_file_path, logger=logger)
        print("[INFO] Extracting MFCCs...")
        vad.compute_mfcc()
        print("[INFO] Extracting MFCCs... done")
        print("[INFO] Executing VAD...")
        vad.compute_vad()
        print("[INFO] Executing VAD... done")
    finally:
        # run the cleanup even if the conversion or the VAD raised
        print("[INFO] Cleaning up...")
        cleanup(tmp_handler, tmp_file_path)
        print("[INFO] Cleaning up... done")

    # the output file is opened with a context manager,
    # so that it is closed even if writing an interval fails
    if mode == "speech":
        print("[INFO] Creating speech file...")
        with open(output_file_path, "w") as output_file:
            for interval in vad.speech:
                output_file.write("%.3f\t%.3f\n" % (interval[0], interval[1]))
        print("[INFO] Creating speech file... done")

    if mode == "nonspeech":
        print("[INFO] Creating nonspeech file...")
        with open(output_file_path, "w") as output_file:
            for interval in vad.nonspeech:
                output_file.write("%.3f\t%.3f\n" % (interval[0], interval[1]))
        print("[INFO] Creating nonspeech file... done")

    if mode == "both":
        print("[INFO] Creating speech and nonspeech file...")
        speech = [[x[0], x[1], "speech"] for x in vad.speech]
        nonspeech = [[x[0], x[1], "nonspeech"] for x in vad.nonspeech]
        # merge the two labelled lists, ordered by start time first
        both = sorted(speech + nonspeech)
        with open(output_file_path, "w") as output_file:
            for interval in both:
                output_file.write("%.3f\t%.3f\t%s\n" % (
                    interval[0],
                    interval[1],
                    interval[2]
                ))
        print("[INFO] Creating speech and nonspeech file... done")

    print("[INFO] Created file %s" % output_file_path)