Example #1
    def _extract_mfcc(self, file_path=None, file_format=None, audio_file=None):
        """
        Extract the MFCCs from the given audio file.

        :rtype: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        """
        audio_file_mfcc = AudioFileMFCC(file_path=file_path,
                                        file_format=file_format,
                                        audio_file=audio_file,
                                        rconf=self.rconf,
                                        logger=self.logger)
        if self.rconf.mmn:
            self.log(u"Running VAD inside _extract_mfcc...")
            audio_file_mfcc.run_vad(
                log_energy_threshold=self.rconf[
                    RuntimeConfiguration.MFCC_MASK_LOG_ENERGY_THRESHOLD],
                min_nonspeech_length=self.rconf[
                    RuntimeConfiguration.MFCC_MASK_MIN_NONSPEECH_LENGTH],
                extend_before=self.rconf[
                    RuntimeConfiguration.
                    MFCC_MASK_EXTEND_SPEECH_INTERVAL_BEFORE],
                extend_after=self.rconf[
                    RuntimeConfiguration.
                    MFCC_MASK_EXTEND_SPEECH_INTERVAL_AFTER])
            self.log(u"Running VAD inside _extract_mfcc... done")
        return audio_file_mfcc
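
The same masking step can be reproduced on a standalone AudioFileMFCC object. A minimal sketch; the input path and the numeric values are illustrative assumptions, not recommended settings:

    from aeneas.audiofilemfcc import AudioFileMFCC

    audio_file_mfcc = AudioFileMFCC("audio.wav")  # hypothetical input path
    audio_file_mfcc.run_vad(
        log_energy_threshold=0.699,  # assumed illustrative value
        min_nonspeech_length=1,      # assumed illustrative value
        extend_before=0,
        extend_after=0,
    )
    speech_intervals = audio_file_mfcc.intervals(speech=True, time=True)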
Example #2
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        :rtype: int
        """
        if len(self.actual_arguments) < 2:
            return self.print_help()
        audio_file_path = self.actual_arguments[0]
        mode = self.actual_arguments[1]
        if mode not in [u"speech", u"nonspeech", u"both"]:
            return self.print_help()
        output_file_path = None
        if len(self.actual_arguments) >= 3:
            output_file_path = self.actual_arguments[2]
        output_time = not self.has_option([u"-i", u"--index"])

        self.check_c_extensions("cmfcc")
        if not self.check_input_file(audio_file_path):
            return self.ERROR_EXIT_CODE
        if (output_file_path is not None) and (not self.check_output_file(output_file_path)):
            return self.ERROR_EXIT_CODE

        self.print_info(u"Reading audio...")
        try:
            audio_file_mfcc = AudioFileMFCC(audio_file_path, rconf=self.rconf, logger=self.logger)
        except AudioFileConverterError:
            self.print_error(u"Unable to call the ffmpeg executable '%s'" % (self.rconf[RuntimeConfiguration.FFMPEG_PATH]))
            self.print_error(u"Make sure the path to ffmpeg is correct")
            return self.ERROR_EXIT_CODE
        except (AudioFileUnsupportedFormatError, AudioFileNotInitializedError):
            self.print_error(u"Cannot read file '%s'" % (audio_file_path))
            self.print_error(u"Check that its format is supported by ffmpeg")
            return self.ERROR_EXIT_CODE
        except Exception as exc:
            self.print_error(u"An unexpected error occurred while reading the audio file:")
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE
        self.print_info(u"Reading audio... done")

        self.print_info(u"Executing VAD...")
        audio_file_mfcc.run_vad()
        self.print_info(u"Executing VAD... done")

        speech = audio_file_mfcc.intervals(speech=True, time=output_time)
        nonspeech = audio_file_mfcc.intervals(speech=False, time=output_time)
        if mode == u"speech":
            intervals = speech
        elif mode == u"nonspeech":
            intervals = nonspeech
        elif mode == u"both":
            speech = [[x[0], x[1], u"speech"] for x in speech]
            nonspeech = [[x[0], x[1], u"nonspeech"] for x in nonspeech]
            intervals = sorted(speech + nonspeech)
        intervals = [tuple(interval) for interval in intervals]
        self.write_to_file(output_file_path, intervals, output_time)

        return self.NO_ERROR_EXIT_CODE
Example #3
 def test_load_mfcc_matrix(self):
     mfccs = numpy.zeros((13, 250))
     audiofile = AudioFileMFCC(mfcc_matrix=mfccs)
     self.assertIsNotNone(audiofile.all_mfcc)
     self.assertAlmostEqual(audiofile.audio_length,
                            TimeValue("10.0"),
                            places=1)
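
The expected value in this test follows from the frame count and the MFCC window shift; a one-line check, assuming the default shift of 0.040 s used elsewhere on this page:

    num_coefficients, num_frames = 13, 250  # shape of the mfcc_matrix above
    window_shift_s = 0.040                  # assumed default MFCC window shift
    print(num_frames * window_shift_s)      # 10.0, matching audio_length above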
Example #4
 def test_set_synt_wave_mfcc(self):
     af = AudioFileMFCC(self.AUDIO_FILE)
     aligner = DTWAligner(synt_wave_mfcc=af)
     self.assertIsNone(aligner.real_wave_mfcc)
     self.assertIsNotNone(aligner.synt_wave_mfcc)
     self.assertIsNone(aligner.real_wave_path)
     self.assertIsNone(aligner.synt_wave_path)
Example #5
 def test_load_audio_file(self):
     af = AudioFile(gf.absolute_path(self.AUDIO_FILE_WAVE, __file__))
     af.read_samples_from_file()
     audiofile = AudioFileMFCC(audio_file=af)
     self.assertIsNotNone(audiofile.all_mfcc)
     self.assertAlmostEqual(audiofile.audio_length,
                            TimeValue("53.3"),
                            places=1)  # 53.266
Example #6
 def load(self, path):
     audiofile = AudioFileMFCC(gf.absolute_path(path, __file__))
     self.assertIsNotNone(audiofile.all_mfcc)
     self.assertFalse(audiofile.is_reversed)
     self.assertNotEqual(audiofile.all_length, 0)
     self.assertEqual(audiofile.head_length, 0)
     self.assertEqual(audiofile.tail_length, 0)
     self.assertNotEqual(audiofile.middle_length, 0)
     self.assertNotEqual(audiofile.audio_length, 0)
     return audiofile
Example #7
    def _extract_mfcc(self, file_path=None, file_format=None, audio_file=None):
        """
        Extract the MFCCs from the given audio file.

        :rtype: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        """
        return AudioFileMFCC(file_path=file_path,
                             file_format=file_format,
                             audio_file=audio_file,
                             rconf=self.rconf,
                             logger=self.logger)
Example #8
 def __init__(self,
              real_wave_mfcc=None,
              synt_wave_mfcc=None,
              real_wave_path=None,
              synt_wave_path=None,
              rconf=None,
              logger=None):
     if (real_wave_mfcc is not None) and (type(real_wave_mfcc)
                                          is not AudioFileMFCC):
         raise ValueError(
             u"Real wave mfcc must be None or of type AudioFileMFCC")
     if (synt_wave_mfcc is not None) and (type(synt_wave_mfcc)
                                          is not AudioFileMFCC):
         raise ValueError(
             u"Synt wave mfcc must be None or of type AudioFileMFCC")
     if (real_wave_path
             is not None) and (not gf.file_can_be_read(real_wave_path)):
         raise ValueError(u"Real wave cannot be read")
     if (synt_wave_path
             is not None) and (not gf.file_can_be_read(synt_wave_path)):
         raise ValueError(u"Synt wave cannot be read")
     if (rconf is not None) and (rconf[RuntimeConfiguration.DTW_ALGORITHM]
                                 not in DTWAlgorithm.ALLOWED_VALUES):
         raise ValueError(u"Algorithm value not allowed")
     super(DTWAligner, self).__init__(rconf=rconf, logger=logger)
     self.real_wave_mfcc = real_wave_mfcc
     self.synt_wave_mfcc = synt_wave_mfcc
     self.real_wave_path = real_wave_path
     self.synt_wave_path = synt_wave_path
     if (self.real_wave_mfcc is None) and (self.real_wave_path is not None):
         self.real_wave_mfcc = AudioFileMFCC(self.real_wave_path,
                                             rconf=self.rconf,
                                             logger=self.logger)
     if (self.synt_wave_mfcc is None) and (self.synt_wave_path is not None):
         self.synt_wave_mfcc = AudioFileMFCC(self.synt_wave_path,
                                             rconf=self.rconf,
                                             logger=self.logger)
     self.dtw = None
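
A hedged usage sketch for the constructor above, combined with the calls shown in the other examples on this page (compute_accumulated_cost_matrix, compute_path); the import paths and file paths are assumptions:

    from aeneas.audiofilemfcc import AudioFileMFCC
    from aeneas.dtw import DTWAligner

    real_mfcc = AudioFileMFCC("real.wav")  # recorded audio (hypothetical path)
    synt_mfcc = AudioFileMFCC("synt.wav")  # synthesized audio (hypothetical path)
    aligner = DTWAligner(real_wave_mfcc=real_mfcc, synt_wave_mfcc=synt_mfcc)
    acm = aligner.compute_accumulated_cost_matrix()
    path = aligner.compute_path()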
Example #9
    def _extract_mfcc(self, file_path=None, file_format=None, audio_file=None):
        """
        Extract the MFCCs from the given audio file.

        :rtype: :class:`~aeneas.audiofilemfcc.AudioFileMFCC`
        """
        audio_file_mfcc = AudioFileMFCC(
            file_path=file_path,
            file_format=file_format,
            audio_file=audio_file,
            rconf=self.rconf,
            logger=self.logger
        )
        if self.rconf.mmn:
            self.log(u"Running VAD inside _extract_mfcc...")
            audio_file_mfcc.run_vad(
                log_energy_threshold=self.rconf[RuntimeConfiguration.MFCC_MASK_LOG_ENERGY_THRESHOLD],
                min_nonspeech_length=self.rconf[RuntimeConfiguration.MFCC_MASK_MIN_NONSPEECH_LENGTH],
                extend_before=self.rconf[RuntimeConfiguration.MFCC_MASK_EXTEND_SPEECH_INTERVAL_BEFORE],
                extend_after=self.rconf[RuntimeConfiguration.MFCC_MASK_EXTEND_SPEECH_INTERVAL_AFTER]
            )
            self.log(u"Running VAD inside _extract_mfcc... done")
        return audio_file_mfcc
Example #10
 def test_compute_acm_real_mfcc(self):
     af = AudioFileMFCC(self.AUDIO_FILE)
     aligner = DTWAligner(real_wave_mfcc=af)
     with self.assertRaises(DTWAlignerNotInitialized):
         aligner.compute_accumulated_cost_matrix()
Example #11
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        :rtype: int
        """
        if len(self.actual_arguments) < 2:
            return self.print_help()
        audio_file_path = self.actual_arguments[0]
        mode = self.actual_arguments[1]
        if mode not in [u"speech", u"nonspeech", u"both"]:
            return self.print_help()
        output_file_path = None
        if len(self.actual_arguments) >= 3:
            output_file_path = self.actual_arguments[2]
        output_time = not self.has_option([u"-i", u"--index"])

        self.check_c_extensions("cmfcc")
        if not self.check_input_file(audio_file_path):
            return self.ERROR_EXIT_CODE
        if (output_file_path is not None) and (
                not self.check_output_file(output_file_path)):
            return self.ERROR_EXIT_CODE

        self.print_info(u"Reading audio...")
        try:
            audio_file_mfcc = AudioFileMFCC(audio_file_path,
                                            rconf=self.rconf,
                                            logger=self.logger)
        except AudioFileConverterError:
            self.print_error(u"Unable to call the ffmpeg executable '%s'" %
                             (self.rconf[RuntimeConfiguration.FFMPEG_PATH]))
            self.print_error(u"Make sure the path to ffmpeg is correct")
            return self.ERROR_EXIT_CODE
        except (AudioFileUnsupportedFormatError, AudioFileNotInitializedError):
            self.print_error(u"Cannot read file '%s'" % (audio_file_path))
            self.print_error(u"Check that its format is supported by ffmpeg")
            return self.ERROR_EXIT_CODE
        except Exception as exc:
            self.print_error(
                u"An unexpected error occurred while reading the audio file:")
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE
        self.print_info(u"Reading audio... done")

        self.print_info(u"Executing VAD...")
        audio_file_mfcc.run_vad()
        self.print_info(u"Executing VAD... done")

        speech = audio_file_mfcc.intervals(speech=True, time=output_time)
        nonspeech = audio_file_mfcc.intervals(speech=False, time=output_time)
        if mode == u"speech":
            intervals = speech
        elif mode == u"nonspeech":
            intervals = nonspeech
        elif mode == u"both":
            speech = [[x[0], x[1], u"speech"] for x in speech]
            nonspeech = [[x[0], x[1], u"nonspeech"] for x in nonspeech]
            intervals = sorted(speech + nonspeech)
        intervals = [tuple(interval) for interval in intervals]
        self.write_to_file(output_file_path, intervals, output_time)

        return self.NO_ERROR_EXIT_CODE
Example #12
 def test_compute_path_synt_mfcc(self):
     af = AudioFileMFCC(self.AUDIO_FILE)
     aligner = DTWAligner(synt_wave_mfcc=af)
     with self.assertRaises(DTWAlignerNotInitialized):
         aligner.compute_path()
Example #13
def build_sync_map(
    text_paths, audio_paths, tmp_dir,
    sync_map_text_path_prefix, sync_map_audio_path_prefix,
    skip_penalty, radius
):
    """
    This is an algorithm for building a sync map.
    It synthesizes text and then aligns synthesized audio with the recorded audio
    using a variation of the DTW (Dynamic Time Warping) algorithm.
    
    The main features of this algorithm are:
    1) It can handle structural differences in the beginning and in the end of files.
    2) It finds an approximation to an optimal warping path in linear time and space using FastDTW approach.

    Note that while the algorithm does not require one-to-one correspondance
    between text and audio files (i.e. the splitting can be done differently),
    the quality of the result is sensitive to the choice of skip_penalty and radius parameters,
    so it is recommended to have such a correspondance.

    Alignment details:
    Synthesized and recorded audio are represented as sequences of MFCC frames.
    These sequences are aligned using variation of the DTW algorithm.
    In contrast to the classic DTW, this algorithms can be used
    to align sequences with structural differences in the beginning or in the end.
    
    Steps to build a sync map:
    1) Synthesize text file and produce a list of anchors.
    Each anchor represents the start of the corresponding text fragment in a synthesized audio.
    2) Get sequences of MFCC frames of synthesized and recorded audio.
    3) Get their warping path by calling the alignment algorithm.
    4) Check whether the extra content is found, calculate mapping boundaries.
    5) Map anchors inside the boundaries to the recorded MFCC sequence using warping path from step 3.
    6) Start all over again considering:
    If there is an extra content in the end of synthesized sequence, align it with the next audio file.
    If there is an extra content in the end of recorded sequence, align it with the next text file.
    If both sequences have extra content in the end, align text tail with the next audio file.
    If none of the above, align next text and audio files.
    """

    synthesizer = Synthesizer()
    parse_parameters = {'is_text_unparsed_id_regex': 'f[0-9]+'}
    
    sync_map = {}
    process_next_text = True
    process_next_audio = True

    while True:
        if process_next_text:
            try:
                text_path = next(text_paths)
            except StopIteration:
                break

            text_name = get_name_from_path(text_path)
            output_text_name = os.path.join(sync_map_text_path_prefix, text_name)
            textfile = TextFile(text_path, file_format=TextFileFormat.UNPARSED, parameters=parse_parameters)
            textfile.set_language(Language.ENG)
            text_wav_path = os.path.join(tmp_dir, f'{drop_extension(text_name)}_text.wav')
            sync_map[output_text_name] = {}

            # Produce synthesized audio, get anchors
            anchors, _, _ = synthesizer.synthesize(textfile, text_wav_path)

            # Get fragments, convert anchor timings to frame indices
            fragments = [a[1] for a in anchors]
            anchors = np.array([int(a[0] / TimeValue('0.040')) for a in anchors])

            # The MFCC frame sequence memory layout is an n x l 2D array,
            # where n is the number of frames and l is the number of MFCCs,
            # i.e. it is C-contiguous, but after dropping the first coefficient it ceases to be C-contiguous.
            # We should decide whether to make a copy or to work around the first coefficient.
            text_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(text_wav_path).all_mfcc.T[:, 1:]
            )
            
        if process_next_audio:
            try:
                audio_path = next(audio_paths)
            except StopIteration:
                break

            audio_name = get_name_from_path(audio_path)
            output_audio_name = os.path.join(sync_map_audio_path_prefix, audio_name)
            audio_wav_path = os.path.join(tmp_dir, f'{drop_extension(audio_name)}_audio.wav')
            subprocess.run(['ffmpeg', '-n', '-i', audio_path, audio_wav_path])

            audio_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(audio_wav_path).all_mfcc.T[:, 1:]
            )
            
            # Keep track of the audio start frame to calculate frame timings
            audio_start_frame = 0
        
        n = len(text_mfcc_sequence)
        m = len(audio_mfcc_sequence)

        _, path = c_FastDTWBD(text_mfcc_sequence, audio_mfcc_sequence, skip_penalty, radius=radius)
        
        if len(path) == 0:
            print(
                f'No match between {text_name} and {audio_name}. '
                f'Alignment is terminated. '
                f'Adjust skip_penalty or input files.'
            )
            return {}
        
        # Project path to the text and audio sequences
        text_path_frames = path[:, 0]
        audio_path_frames = path[:, 1]
        
        last_matched_audio_frame = audio_path_frames[-1]

        # Find first and last matched frames
        first_matched_text_frame = text_path_frames[0]
        last_matched_text_frame = text_path_frames[-1]

        # Map only those fragments that intersect matched frames
        anchors_boundary_indices = np.searchsorted(
            anchors, [first_matched_text_frame, last_matched_text_frame]
        )
        map_anchors_from = max(anchors_boundary_indices[0] - 1, 0)
        map_anchors_to = anchors_boundary_indices[1]
        anchors_to_map = anchors[map_anchors_from:map_anchors_to]
        fragments_to_map = fragments[map_anchors_from:map_anchors_to]

        # Get anchor indices in the path projection onto the text sequence
        text_path_anchor_indices = np.searchsorted(text_path_frames, anchors_to_map)
        
        # Get anchors' frames in audio sequence, calculate their timings
        anchors_matched_frames = audio_path_frames[text_path_anchor_indices]
        timings = (np.append(anchors_matched_frames, audio_path_frames[-1]) + audio_start_frame) * 0.040
        
        # Map fragment_ids to timings, update mapping of the current text file
        fragment_map = {
            f: {
                'audio_file': output_audio_name,
                'begin_time': time_to_str(bt),
                'end_time': time_to_str(et)
            }
            for f, bt, et in zip(fragments_to_map, timings[:-1], timings[1:])
        }

        sync_map[output_text_name].update(fragment_map)
        
        # Decide whether to process next file or to align the tail of the current one

        if map_anchors_to == len(anchors):
            # Process next text if no fragments are left
            process_next_text = True
        else:
            # Otherwise align tail of the current text
            process_next_text = False
            text_mfcc_sequence = text_mfcc_sequence[last_matched_text_frame:]
            fragments = fragments[map_anchors_to:]
            anchors = anchors[map_anchors_to:] - last_matched_text_frame
            
        if last_matched_audio_frame == m - 1 or not process_next_text:
            # Process next audio if there are no unmatched audio frames in the tail
            # or if there are more text fragments to map, i.e.
            # we choose to process the next audio if we cannot decide.
            # This strategy is correct if there are no extra fragments at the end.
            process_next_audio = True
        else:
            # Otherwise align tail of the current audio
            process_next_audio = False
            audio_mfcc_sequence = audio_mfcc_sequence[last_matched_audio_frame:]
            audio_start_frame += last_matched_audio_frame
    
    return sync_map
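
A hedged usage sketch for build_sync_map; the directory layout, prefixes, and parameter values are assumptions, and text_paths/audio_paths must be iterators because the function advances them with next():

    import glob
    import json

    text_paths = iter(sorted(glob.glob("text/*.xhtml")))  # assumed layout
    audio_paths = iter(sorted(glob.glob("audio/*.mp3")))   # assumed layout
    sync_map = build_sync_map(
        text_paths, audio_paths, "/tmp/sync_work",
        sync_map_text_path_prefix="text",
        sync_map_audio_path_prefix="audio",
        skip_penalty=0.75,  # assumed value, tune for the material
        radius=100,         # assumed value, tune for the material
    )
    with open("sync_map.json", "w") as f:
        json.dump(sync_map, f, indent=2)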
Example #14
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        :rtype: int
        """
        if len(self.actual_arguments) < 2:
            return self.print_help()
        input_file_path = self.actual_arguments[0]
        output_file_path = self.actual_arguments[1]

        output_text_format = self.has_option_with_value(u"--format")
        if output_text_format is None:
            output_text_format = u"%.18e"
        output_binary = self.has_option([u"-b", u"--binary"])
        output_npz = self.has_option([u"-z", u"--npz"])
        output_npy = self.has_option([u"-n", u"--npy"])
        delete_first = self.has_option([u"-d", u"--delete-first"])
        transpose = self.has_option([u"-t", u"--transpose"])

        self.check_c_extensions("cmfcc")
        if not self.check_input_file(input_file_path):
            return self.ERROR_EXIT_CODE
        if not self.check_output_file(output_file_path):
            return self.ERROR_EXIT_CODE

        try:
            mfccs = AudioFileMFCC(input_file_path, rconf=self.rconf, logger=self.logger).all_mfcc
            if delete_first:
                mfccs = mfccs[1:, :]
            if transpose:
                mfccs = mfccs.transpose()
            if output_binary:
                # save as a raw C float64 binary file
                mapped = numpy.memmap(output_file_path, dtype="float64", mode="w+", shape=mfccs.shape)
                mapped[:] = mfccs[:]
                mapped.flush()
                del mapped
            elif output_npz:
                # save as a .npz compressed binary file
                with io.open(output_file_path, "wb") as output_file:
                    numpy.savez(output_file, mfccs)
            elif output_npy:
                # save as a .npy binary file
                with io.open(output_file_path, "wb") as output_file:
                    numpy.save(output_file, mfccs)
            else:
                # save as a text file
                # NOTE: in Python 2, passing a Unicode string as the fmt value crashes NumPy,
                #       hence we convert it back to bytes, which also works in Python 3
                numpy.savetxt(output_file_path, mfccs, fmt=gf.safe_bytes(output_text_format))
            self.print_info(u"MFCCs shape: %d %d" % (mfccs.shape))
            self.print_success(u"MFCCs saved to '%s'" % (output_file_path))
            return self.NO_ERROR_EXIT_CODE
        except AudioFileConverterError:
            self.print_error(u"Unable to call the ffmpeg executable '%s'" % (self.rconf[RuntimeConfiguration.FFMPEG_PATH]))
            self.print_error(u"Make sure the path to ffmpeg is correct")
        except (AudioFileUnsupportedFormatError, AudioFileNotInitializedError):
            self.print_error(u"Cannot read file '%s'" % (input_file_path))
            self.print_error(u"Check that its format is supported by ffmpeg")
        except OSError:
            self.print_error(u"Cannot write file '%s'" % (output_file_path))

        return self.ERROR_EXIT_CODE
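
The files written by this command can be loaded back with NumPy; a minimal sketch, where the file names are hypothetical and the shape for the raw binary case is the "MFCCs shape" printed by the command:

    import numpy

    mfccs_npy = numpy.load("out.npy")             # --npy output
    mfccs_npz = numpy.load("out.npz")["arr_0"]    # --npz output (default array key)
    mfccs_txt = numpy.loadtxt("out.txt")          # plain text output
    shape = (13, 1331)                            # hypothetical values, as printed by "MFCCs shape"
    mfccs_bin = numpy.fromfile("out.bin", dtype="float64").reshape(shape)  # --binary output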
Example #15
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        :rtype: int
        """
        if len(self.actual_arguments) < 2:
            return self.print_help()
        input_file_path = self.actual_arguments[0]
        output_file_path = self.actual_arguments[1]

        output_text_format = self.has_option_with_value(u"--format")
        if output_text_format is None:
            output_text_format = u"%.18e"
        output_binary = self.has_option([u"-b", u"--binary"])
        output_npz = self.has_option([u"-z", u"--npz"])
        output_npy = self.has_option([u"-n", u"--npy"])
        delete_first = self.has_option([u"-d", u"--delete-first"])
        transpose = self.has_option([u"-t", u"--transpose"])

        self.check_c_extensions("cmfcc")
        if not self.check_input_file(input_file_path):
            return self.ERROR_EXIT_CODE
        if not self.check_output_file(output_file_path):
            return self.ERROR_EXIT_CODE

        try:
            mfccs = AudioFileMFCC(input_file_path,
                                  rconf=self.rconf,
                                  logger=self.logger).all_mfcc
            if delete_first:
                mfccs = mfccs[1:, :]
            if transpose:
                mfccs = mfccs.transpose()
            if output_binary:
                # save as a raw C float64 binary file
                mapped = numpy.memmap(output_file_path,
                                      dtype="float64",
                                      mode="w+",
                                      shape=mfccs.shape)
                mapped[:] = mfccs[:]
                mapped.flush()
                del mapped
            elif output_npz:
                # save as a .npz compressed binary file
                with io.open(output_file_path, "wb") as output_file:
                    numpy.savez(output_file, mfccs)
            elif output_npy:
                # save as a .npy binary file
                with io.open(output_file_path, "wb") as output_file:
                    numpy.save(output_file, mfccs)
            else:
                # save as a text file
                # NOTE: in Python 2, passing a Unicode string as the fmt value crashes NumPy,
                #       hence we convert it back to bytes, which also works in Python 3
                numpy.savetxt(output_file_path,
                              mfccs,
                              fmt=gf.safe_bytes(output_text_format))
            self.print_info(u"MFCCs shape: %d %d" % (mfccs.shape))
            self.print_success(u"MFCCs saved to '%s'" % (output_file_path))
            return self.NO_ERROR_EXIT_CODE
        except AudioFileConverterError:
            self.print_error(u"Unable to call the ffmpeg executable '%s'" %
                             (self.rconf[RuntimeConfiguration.FFMPEG_PATH]))
            self.print_error(u"Make sure the path to ffmpeg is correct")
        except (AudioFileUnsupportedFormatError, AudioFileNotInitializedError):
            self.print_error(u"Cannot read file '%s'" % (input_file_path))
            self.print_error(u"Check that its format is supported by ffmpeg")
        except OSError:
            self.print_error(u"Cannot write file '%s'" % (output_file_path))

        return self.ERROR_EXIT_CODE
Example #16
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        :rtype: int
        """
        if len(self.actual_arguments) < 4:
            return self.print_help()
        text_format = gf.safe_unicode(self.actual_arguments[0])
        if text_format == u"list":
            text = gf.safe_unicode(self.actual_arguments[1])
        elif text_format in TextFileFormat.ALLOWED_VALUES:
            text = self.actual_arguments[1]
            if not self.check_input_file(text):
                return self.ERROR_EXIT_CODE
        else:
            return self.print_help()

        l1_id_regex = self.has_option_with_value(u"--l1-id-regex")
        l2_id_regex = self.has_option_with_value(u"--l2-id-regex")
        l3_id_regex = self.has_option_with_value(u"--l3-id-regex")
        id_regex = self.has_option_with_value(u"--id-regex")
        class_regex = self.has_option_with_value(u"--class-regex")
        sort = self.has_option_with_value(u"--sort")
        parameters = {
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX : l1_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX : l2_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX : l3_id_regex,
            gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX : id_regex,
            gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX : class_regex,
            gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT : sort
        }
        if (text_format == TextFileFormat.MUNPARSED) and ((l1_id_regex is None) or (l2_id_regex is None) or (l3_id_regex is None)):
            self.print_error(u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format")
            return self.ERROR_EXIT_CODE
        if (text_format == TextFileFormat.UNPARSED) and (id_regex is None) and (class_regex is None):
            self.print_error(u"You must specify --id-regex and/or --class-regex for unparsed format")
            return self.ERROR_EXIT_CODE

        language = gf.safe_unicode(self.actual_arguments[2])

        audio_file_path = self.actual_arguments[3]
        if not self.check_input_file(audio_file_path):
            return self.ERROR_EXIT_CODE

        text_file = self.get_text_file(text_format, text, parameters)
        if text_file is None:
            self.print_error(u"Unable to build a TextFile from the given parameters")
            return self.ERROR_EXIT_CODE
        elif len(text_file) == 0:
            self.print_error(u"No text fragments found")
            return self.ERROR_EXIT_CODE
        text_file.set_language(language)
        self.print_info(u"Read input text with %d fragments" % (len(text_file)))

        self.print_info(u"Reading audio...")
        try:
            audio_file_mfcc = AudioFileMFCC(audio_file_path, rconf=self.rconf, logger=self.logger)
        except AudioFileConverterError:
            self.print_error(u"Unable to call the ffmpeg executable '%s'" % (self.rconf[RuntimeConfiguration.FFMPEG_PATH]))
            self.print_error(u"Make sure the path to ffmpeg is correct")
            return self.ERROR_EXIT_CODE
        except (AudioFileUnsupportedFormatError, AudioFileNotInitializedError):
            self.print_error(u"Cannot read file '%s'" % (audio_file_path))
            self.print_error(u"Check that its format is supported by ffmpeg")
            return self.ERROR_EXIT_CODE
        except Exception as exc:
            self.print_error(u"An unexpected error occurred while reading the audio file:")
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE
        self.print_info(u"Reading audio... done")

        self.print_info(u"Running VAD...")
        audio_file_mfcc.run_vad()
        self.print_info(u"Running VAD... done")

        min_head = gf.safe_float(self.has_option_with_value(u"--min-head"), None)
        max_head = gf.safe_float(self.has_option_with_value(u"--max-head"), None)
        min_tail = gf.safe_float(self.has_option_with_value(u"--min-tail"), None)
        max_tail = gf.safe_float(self.has_option_with_value(u"--max-tail"), None)

        self.print_info(u"Detecting audio interval...")
        start_detector = SD(audio_file_mfcc, text_file, rconf=self.rconf, logger=self.logger)
        start, end = start_detector.detect_interval(min_head, max_head, min_tail, max_tail)
        self.print_info(u"Detecting audio interval... done")

        self.print_result(audio_file_mfcc.audio_length, start, end)
        return self.NO_ERROR_EXIT_CODE
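
A short sketch of how the printed interval can be interpreted, assuming detect_interval() returns the begin and end of the detected speech interval in seconds:

    head_length = start                               # audio before the text starts
    tail_length = audio_file_mfcc.audio_length - end  # audio after the text ends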
Example #17
 def perform(self, input_file_path, speech_length, nonspeech_length):
     audiofile = AudioFileMFCC(gf.absolute_path(input_file_path, __file__))
     audiofile.run_vad()
     self.assertEqual(len(audiofile.intervals(speech=True)), speech_length)
     self.assertEqual(len(audiofile.intervals(speech=False)),
                      nonspeech_length)
Example #18
    def _time_and_combine(self, text_file):
        """
        Combine original audio clips into a single WAV file.

        Return a tuple consisting of:

        1. the handler of the generated audio file
        2. the path of the generated audio file
        3. the list of anchors, that is, a list of
           ``[start_time, fragment_id, audio_path]`` triples,
           where ``start_time`` is the start time of the corresponding
           text fragment in the generated wave file
        4. a tuple describing the format of the audio file

        :param text_file: the text with audio clips to be timed/combined
        :type  text_file: :class:`~aeneas.textfile.TextFile`
        :rtype: tuple (handler, string, list, tuple)
        """
        import subprocess

        # Concatenate all clips into a single, temporary file
        handler, path = gf.tmp_file(
            suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        cmd = "ffmpeg -y -f concat -i {} -c copy {}".format(
            text_file.file_path, path)
        subprocess.call(cmd, shell=True)

        audio_format = ('pcm_s16le', 1, 2)

        # Build "synt" anchor times
        anchor_time, anchors = TimeValue('0.0'), []
        for fragment in text_file.fragments:
            audio_path = 'output/sample/{}'.format(fragment.text.split("'")[1])
            audio_file = AudioFileMFCC(file_path=audio_path,
                                       file_format=audio_format)
            # TODO: Investigate faster ways to get the audio_length
            # cmd = 'ffprobe -i {} -show_entries format=duration -v quiet -of csv="p=0"'.format(audio_path)
            # subprocess.call(cmd, shell=True)
            # # should become... (to get response)
            # cmds = ['ffprobe', '-i', audio_path, '-show_entries', 'format=duration',
            #         '-v', 'quiet', '-of', 'csv="p=0"']
            # p = subprocess.Popen(cmds, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # output, err = p.communicate()
            # audio_length = TimeValue(output)
            anchors.append([anchor_time, fragment.identifier, audio_path])
            anchor_time += audio_file.audio_length

        # [
        #     [TimeValue('0.0'), u'f000001', 'output/sample/audio01.wav'],
        #     [TimeValue('0.339625'), u'f000002', 'output/sample/audio02.wav'],
        #     [TimeValue('3.5526875'), u'f000003', 'output/sample/audio03.wav'],
        #     [TimeValue('6.6874375'), u'f000004', 'output/sample/audio04.wav'],
        #     [TimeValue('9.5609375'), u'f000005', 'output/sample/audio05.wav'],
        #     [TimeValue('12.4344375'), u'f000006', 'output/sample/audio06.wav'],
        #     [TimeValue('16.1961250'), u'f000007', 'output/sample/audio07.wav'],
        #     [TimeValue('19.9578125'), u'f000008', 'output/sample/audio08.wav'],
        #     [TimeValue('23.0925625'), u'f000009', 'output/sample/audio09.wav'],
        #     [TimeValue('28.0297500'), u'f000010', 'output/sample/audio10.wav'],
        #     [TimeValue('31.1645000'), u'f000011', 'output/sample/audio11.wav'],
        #     [TimeValue('33.5678125'), u'f000012', 'output/sample/audio12.wav'],
        #     [TimeValue('37.0943750'), u'f000013', 'output/sample/audio13.wav'],
        #     [TimeValue('40.2030000'), u'f000014', 'output/sample/audio14.wav'],
        #     [TimeValue('43.8601875'), u'f000015', 'output/sample/audio15.wav']
        # ]

        # import pdb; pdb.set_trace()

        return (handler, path, anchors, audio_format)
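
The TODO above sketches an ffprobe call for reading the clip duration; a hedged working version of that idea, which avoids the shell quoting issue by passing csv=p=0 unquoted and decodes the byte output before building the TimeValue (the exacttiming import path follows the references used elsewhere on this page; older aeneas versions expose TimeValue from aeneas.timevalue):

    import subprocess

    from aeneas.exacttiming import TimeValue

    def probe_duration(audio_path):
        # ask ffprobe for the container duration only, as plain CSV
        result = subprocess.run(
            ["ffprobe", "-i", audio_path, "-show_entries", "format=duration",
             "-v", "quiet", "-of", "csv=p=0"],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return TimeValue(result.stdout.decode("utf-8").strip())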
Example #19
    def _detect(self, min_length, max_length, tail=False):
        """
        Detect the head or tail within ``min_length`` and ``max_length`` duration.

        If detecting the tail, the real wave MFCC and the query are reversed
        so that the tail detection problem reduces to a head detection problem.

        Return the duration of the head or tail, in seconds.

        :param min_length: estimated minimum length
        :type  min_length: :class:`~aeneas.exacttiming.TimeValue`
        :param max_length: estimated maximum length
        :type  max_length: :class:`~aeneas.exacttiming.TimeValue`
        :rtype: :class:`~aeneas.exacttiming.TimeValue`
        :raises: TypeError: if one of the parameters is not ``None`` or a number
        :raises: ValueError: if one of the parameters is negative
        """
        def _sanitize(value, default, name):
            if value is None:
                value = default
            try:
                value = TimeValue(value)
            except (TypeError, ValueError, InvalidOperation) as exc:
                self.log_exc(u"The value of %s is not a number" % (name), exc, True, TypeError)
            if value < 0:
                self.log_exc(u"The value of %s is negative" % (name), None, True, ValueError)
            return value

        min_length = _sanitize(min_length, self.MIN_LENGTH, "min_length")
        max_length = _sanitize(max_length, self.MAX_LENGTH, "max_length")
        mws = self.rconf.mws
        min_length_frames = int(min_length / mws)
        max_length_frames = int(max_length / mws)
        self.log([u"MFCC window shift s:     %.3f", mws])
        self.log([u"Min start length s:      %.3f", min_length])
        self.log([u"Min start length frames: %d", min_length_frames])
        self.log([u"Max start length s:      %.3f", max_length])
        self.log([u"Max start length frames: %d", max_length_frames])
        self.log([u"Tail?:                   %s", str(tail)])

        self.log(u"Synthesizing query...")
        synt_duration = max_length * self.QUERY_FACTOR
        self.log([u"Synthesizing at least %.3f seconds", synt_duration])
        tmp_handler, tmp_file_path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        synt = Synthesizer(rconf=self.rconf, logger=self.logger)
        anchors, total_time, synthesized_chars = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=tail
        )
        self.log(u"Synthesizing query... done")

        self.log(u"Extracting MFCCs for query...")
        query_mfcc = AudioFileMFCC(tmp_file_path, rconf=self.rconf, logger=self.logger)
        self.log(u"Extracting MFCCs for query... done")

        self.log(u"Cleaning up...")
        gf.delete_file(tmp_handler, tmp_file_path)
        self.log(u"Cleaning up... done")

        search_window = max_length * self.AUDIO_FACTOR
        search_window_end = min(int(search_window / mws), self.real_wave_mfcc.all_length)
        self.log([u"Query MFCC length (frames): %d", query_mfcc.all_length])
        self.log([u"Real MFCC length (frames):  %d", self.real_wave_mfcc.all_length])
        self.log([u"Search window end (s):      %.3f", search_window])
        self.log([u"Search window end (frames): %d", search_window_end])

        if tail:
            self.log(u"Tail => reversing real_wave_mfcc and query_mfcc")
            self.real_wave_mfcc.reverse()
            query_mfcc.reverse()

        # NOTE: VAD will be run here, if not done before
        speech_intervals = self.real_wave_mfcc.intervals(speech=True, time=False)
        if len(speech_intervals) < 1:
            self.log(u"No speech intervals, hence no start found")
            if tail:
                self.real_wave_mfcc.reverse()
            return TimeValue("0.000")

        # generate a list of begin indices
        search_end = None
        candidates_begin = []
        for interval in speech_intervals:
            if (interval[0] >= min_length_frames) and (interval[0] <= max_length_frames):
                candidates_begin.append(interval[0])
            search_end = interval[1]
            if search_end >= search_window_end:
                break

        # for each begin index, compute the acm cost
        # of matching the query;
        # note that we take the min over the last column of the acm,
        # meaning that we allow the entire query wave to be matched
        # against a portion of the real wave
        candidates = []
        for candidate_begin in candidates_begin:
            self.log([u"Candidate interval starting at %d == %.3f", candidate_begin, candidate_begin * mws])
            try:
                rwm = AudioFileMFCC(
                    mfcc_matrix=self.real_wave_mfcc.all_mfcc[:, candidate_begin:search_end],
                    rconf=self.rconf,
                    logger=self.logger
                )
                dtw = DTWAligner(
                    real_wave_mfcc=rwm,
                    synt_wave_mfcc=query_mfcc,
                    rconf=self.rconf,
                    logger=self.logger
                )
                acm = dtw.compute_accumulated_cost_matrix()
                last_column = acm[:, -1]
                min_value = numpy.min(last_column)
                min_index = numpy.argmin(last_column)
                self.log([u"Candidate interval: %d %d == %.3f %.3f", candidate_begin, search_end, candidate_begin * mws, search_end * mws])
                self.log([u"  Min value: %.6f", min_value])
                self.log([u"  Min index: %d == %.3f", min_index, min_index * mws])
                candidates.append((min_value, candidate_begin, min_index))
            except Exception as exc:
                self.log_exc(u"An unexpected error occurred while running _detect", exc, False, None)

        # reverse again the real wave
        if tail:
            self.log(u"Tail => reversing real_wave_mfcc again")
            self.real_wave_mfcc.reverse()

        # return
        if len(candidates) < 1:
            self.log(u"No candidates found")
            return TimeValue("0.000")
        self.log(u"Candidates:")
        for candidate in candidates:
            self.log([u"  Value: %.6f Begin Time: %.3f Min Index: %d", candidate[0], candidate[1] * mws, candidate[2]])
        best = sorted(candidates)[0][1]
        self.log([u"Best candidate: %d == %.3f", best, best * mws])
        return best * mws
Example #20
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        :rtype: int
        """
        if len(self.actual_arguments) < 4:
            return self.print_help()
        text_format = gf.safe_unicode(self.actual_arguments[0])
        if text_format == u"list":
            text = gf.safe_unicode(self.actual_arguments[1])
        elif text_format in TextFileFormat.ALLOWED_VALUES:
            text = self.actual_arguments[1]
            if not self.check_input_file(text):
                return self.ERROR_EXIT_CODE
        else:
            return self.print_help()

        l1_id_regex = self.has_option_with_value(u"--l1-id-regex")
        l2_id_regex = self.has_option_with_value(u"--l2-id-regex")
        l3_id_regex = self.has_option_with_value(u"--l3-id-regex")
        id_regex = self.has_option_with_value(u"--id-regex")
        class_regex = self.has_option_with_value(u"--class-regex")
        sort = self.has_option_with_value(u"--sort")
        parameters = {
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: l1_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: l2_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: l3_id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: class_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: sort,
        }
        if (text_format == TextFileFormat.MUNPARSED) and (
            (l1_id_regex is None) or (l2_id_regex is None) or
            (l3_id_regex is None)):
            self.print_error(
                u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format"
            )
            return self.ERROR_EXIT_CODE
        if (text_format == TextFileFormat.UNPARSED) and (
                id_regex is None) and (class_regex is None):
            self.print_error(
                u"You must specify --id-regex and/or --class-regex for unparsed format"
            )
            return self.ERROR_EXIT_CODE

        language = gf.safe_unicode(self.actual_arguments[2])

        audio_file_path = self.actual_arguments[3]
        if not self.check_input_file(audio_file_path):
            return self.ERROR_EXIT_CODE

        text_file = self.get_text_file(text_format, text, parameters)
        if text_file is None:
            self.print_error(
                u"Unable to build a TextFile from the given parameters")
            return self.ERROR_EXIT_CODE
        elif len(text_file) == 0:
            self.print_error(u"No text fragments found")
            return self.ERROR_EXIT_CODE
        text_file.set_language(language)
        self.print_info(u"Read input text with %d fragments" %
                        (len(text_file)))

        self.print_info(u"Reading audio...")
        try:
            audio_file_mfcc = AudioFileMFCC(audio_file_path,
                                            rconf=self.rconf,
                                            logger=self.logger)
        except AudioFileConverterError:
            self.print_error(u"Unable to call the ffmpeg executable '%s'" %
                             (self.rconf[RuntimeConfiguration.FFMPEG_PATH]))
            self.print_error(u"Make sure the path to ffmpeg is correct")
            return self.ERROR_EXIT_CODE
        except (AudioFileUnsupportedFormatError, AudioFileNotInitializedError):
            self.print_error(u"Cannot read file '%s'" % (audio_file_path))
            self.print_error(u"Check that its format is supported by ffmpeg")
            return self.ERROR_EXIT_CODE
        except Exception as exc:
            self.print_error(
                u"An unexpected error occurred while reading the audio file:")
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE
        self.print_info(u"Reading audio... done")

        self.print_info(u"Running VAD...")
        audio_file_mfcc.run_vad()
        self.print_info(u"Running VAD... done")

        min_head = gf.safe_float(self.has_option_with_value(u"--min-head"),
                                 None)
        max_head = gf.safe_float(self.has_option_with_value(u"--max-head"),
                                 None)
        min_tail = gf.safe_float(self.has_option_with_value(u"--min-tail"),
                                 None)
        max_tail = gf.safe_float(self.has_option_with_value(u"--max-tail"),
                                 None)

        self.print_info(u"Detecting audio interval...")
        start_detector = SD(audio_file_mfcc,
                            text_file,
                            rconf=self.rconf,
                            logger=self.logger)
        start, end = start_detector.detect_interval(min_head, max_head,
                                                    min_tail, max_tail)
        self.print_info(u"Detecting audio interval... done")

        self.print_result(audio_file_mfcc.audio_length, start, end)
        return self.NO_ERROR_EXIT_CODE
Example #21
 def load(self):
     audio_file_mfcc = AudioFileMFCC(self.AUDIO_FILE)
     text_file = TextFile(self.TEXT_FILE, file_format=TextFileFormat.PLAIN)
     text_file.set_language(Language.ENG)
     return SD(audio_file_mfcc, text_file)
Example #22
    def _detect(self, min_length, max_length, tail=False):
        """
        Detect the head or tail within ``min_length`` and ``max_length`` duration.

        If detecting the tail, the real wave MFCC and the query are reversed
        so that the tail detection problem reduces to a head detection problem.

        Return the duration of the head or tail, in seconds.

        :param min_length: estimated minimum length
        :type  min_length: :class:`~aeneas.timevalue.TimeValue`
        :param max_length: estimated maximum length
        :type  max_length: :class:`~aeneas.timevalue.TimeValue`
        :rtype: :class:`~aeneas.timevalue.TimeValue`
        :raises: TypeError: if one of the parameters is not ``None`` or a number
        :raises: ValueError: if one of the parameters is negative
        """
        def _sanitize(value, default, name):
            if value is None:
                value = default
            try:
                value = TimeValue(value)
            except (TypeError, ValueError, InvalidOperation) as exc:
                self.log_exc(u"The value of %s is not a number" % (name), exc, True, TypeError)
            if value < 0:
                self.log_exc(u"The value of %s is negative" % (name), None, True, ValueError)
            return value

        min_length = _sanitize(min_length, self.MIN_LENGTH, "min_length")
        max_length = _sanitize(max_length, self.MAX_LENGTH, "max_length")
        mws = self.rconf.mws
        min_length_frames = int(min_length / mws)
        max_length_frames = int(max_length / mws)
        self.log([u"MFCC window shift s:     %.3f", mws])
        self.log([u"Min start length s:      %.3f", min_length])
        self.log([u"Min start length frames: %d", min_length_frames])
        self.log([u"Max start length s:      %.3f", max_length])
        self.log([u"Max start length frames: %d", max_length_frames])
        self.log([u"Tail?:                   %s", str(tail)])

        self.log(u"Synthesizing query...")
        synt_duration = max_length * self.QUERY_FACTOR
        self.log([u"Synthesizing at least %.3f seconds", synt_duration])
        tmp_handler, tmp_file_path = gf.tmp_file(suffix=u".wav", root=self.rconf[RuntimeConfiguration.TMP_PATH])
        synt = Synthesizer(rconf=self.rconf, logger=self.logger)
        anchors, total_time, synthesized_chars = synt.synthesize(
            self.text_file,
            tmp_file_path,
            quit_after=synt_duration,
            backwards=tail
        )
        self.log(u"Synthesizing query... done")

        self.log(u"Extracting MFCCs for query...")
        query_mfcc = AudioFileMFCC(tmp_file_path, rconf=self.rconf, logger=self.logger)
        self.log(u"Extracting MFCCs for query... done")

        self.log(u"Cleaning up...")
        gf.delete_file(tmp_handler, tmp_file_path)
        self.log(u"Cleaning up... done")

        search_window = max_length * self.AUDIO_FACTOR
        search_window_end = min(int(search_window / mws), self.real_wave_mfcc.all_length)
        self.log([u"Query MFCC length (frames): %d", query_mfcc.all_length])
        self.log([u"Real MFCC length (frames):  %d", self.real_wave_mfcc.all_length])
        self.log([u"Search window end (s):      %.3f", search_window])
        self.log([u"Search window end (frames): %d", search_window_end])

        if tail:
            self.log(u"Tail => reversing real_wave_mfcc and query_mfcc")
            self.real_wave_mfcc.reverse()
            query_mfcc.reverse()

        # NOTE: VAD will be run here, if not done before
        speech_intervals = self.real_wave_mfcc.intervals(speech=True, time=False)
        if len(speech_intervals) < 1:
            self.log(u"No speech intervals, hence no start found")
            if tail:
                self.real_wave_mfcc.reverse()
            return TimeValue("0.000")

        # generate a list of begin indices
        search_end = None
        candidates_begin = []
        for interval in speech_intervals:
            if (interval[0] >= min_length_frames) and (interval[0] <= max_length_frames):
                candidates_begin.append(interval[0])
            search_end = interval[1]
            if search_end >= search_window_end:
                break

        # for each begin index, compute the acm cost
        # of matching the query;
        # note that we take the min over the last column of the acm,
        # meaning that we allow the entire query wave to be matched
        # against a portion of the real wave
        candidates = []
        for candidate_begin in candidates_begin:
            self.log([u"Candidate interval starting at %d == %.3f", candidate_begin, candidate_begin * mws])
            try:
                rwm = AudioFileMFCC(
                    mfcc_matrix=self.real_wave_mfcc.all_mfcc[:, candidate_begin:search_end],
                    rconf=self.rconf,
                    logger=self.logger
                )
                dtw = DTWAligner(
                    real_wave_mfcc=rwm,
                    synt_wave_mfcc=query_mfcc,
                    rconf=self.rconf,
                    logger=self.logger
                )
                acm = dtw.compute_accumulated_cost_matrix()
                last_column = acm[:, -1]
                min_value = numpy.min(last_column)
                min_index = numpy.argmin(last_column)
                self.log([u"Candidate interval: %d %d == %.3f %.3f", candidate_begin, search_end, candidate_begin * mws, search_end * mws])
                self.log([u"  Min value: %.6f", min_value])
                self.log([u"  Min index: %d == %.3f", min_index, min_index * mws])
                candidates.append((min_value, candidate_begin, min_index))
            except Exception as exc:
                self.log_exc(u"An unexpected error occurred while running _detect", exc, False, None)

        # reverse again the real wave
        if tail:
            self.log(u"Tail => reversing real_wave_mfcc again")
            self.real_wave_mfcc.reverse()

        # return
        if len(candidates) < 1:
            self.log(u"No candidates found")
            return TimeValue("0.000")
        self.log(u"Candidates:")
        for candidate in candidates:
            self.log([u"  Value: %.6f Begin Time: %.3f Min Index: %d", candidate[0], candidate[1] * mws, candidate[2]])
        best = sorted(candidates)[0][1]
        self.log([u"Best candidate: %d == %.3f", best, best * mws])
        return best * mws
Example #23
 def perform(self, input_file_path, speech_length, nonspeech_length):
     audiofile = AudioFileMFCC(gf.absolute_path(input_file_path, __file__))
     audiofile.run_vad()
     self.assertEqual(len(audiofile.intervals(speech=True)), speech_length)
     self.assertEqual(len(audiofile.intervals(speech=False)), nonspeech_length)