Example 1
0
    def test_encode_label_ends_at_utterance_end(self):
        """A label running to infinity is clipped at the utterance end."""
        wav_path = resources.sample_wav_file('med_len.wav')
        source_track = tracks.FileTrack('file1', wav_path)
        utterance = tracks.Utterance('utt1', source_track, start=3, end=14)

        label_list = annotations.LabelList(labels=[
            annotations.Label('speech', 0, 4),
            annotations.Label('music', 4, 9),
            annotations.Label('speech', 9, float('inf')),
        ])
        utterance.set_label_list(label_list)

        encoder = encoding.FrameHotEncoder(
            ['music', 'speech', 'noise'],
            'default',
            frame_settings=units.FrameSettings(32000, 16000),
            sr=16000)

        encoded = encoder.encode_utterance(utterance)

        # Rows built from the per-class one-hot patterns:
        # 3 speech frames, an overlap frame, 4 music frames,
        # another overlap frame, and a final speech frame.
        speech = [0, 1, 0]
        music = [1, 0, 0]
        overlap = [1, 1, 0]
        expected = np.array(
            [speech] * 3 + [overlap] + [music] * 4 + [overlap] + [speech]
        ).astype(np.float32)

        assert np.array_equal(expected, encoded)
Example 2
0
    def test_encode_full_utterance(self):
        """An utterance of the multi-label corpus is hot-encoded per frame."""
        corpus = resources.create_multi_label_corpus()
        encoder = encoding.FrameHotEncoder(
            ['music', 'speech', 'noise'],
            'default',
            frame_settings=units.FrameSettings(32000, 16000),
            sr=16000)

        encoded = encoder.encode_utterance(corpus.utterances['utt-6'])

        # 4 music frames, 1 frame where both labels overlap,
        # 7 speech frames, then 2 closing music frames.
        music = [1, 0, 0]
        speech = [0, 1, 0]
        both = [1, 1, 0]
        expected = np.array(
            [music] * 4 + [both] + [speech] * 7 + [music] * 2
        ).astype(np.float32)

        assert np.array_equal(expected, encoded)
Example 3
0
    def process_file(self,
                     file_path,
                     frame_size=400,
                     hop_size=160,
                     sr=None,
                     start=0,
                     end=-1,
                     utterance=None,
                     corpus=None):
        """
        Process the audio-file in **offline** mode, in one go.

        The file is loaded with librosa, zero-padded so the last frame is
        complete, split into overlapping frames and handed to
        ``process_frames`` as a single chunk (``last=True``).

        Args:
            file_path (str): The audio file to process.
            frame_size (int): The number of samples per frame.
            hop_size (int): The number of samples between two frames.
            sr (int): Use the given sampling rate. If None uses the native
                sampling rate from the underlying data.
            start (float): The point within the file in seconds to start
                processing from.
            end (float): The point within the file in seconds to end
                processing. Any value <= 0 (default -1) means "process
                until the end of the file".
            utterance (Utterance): The utterance that is associated with
                this file, if available.
            corpus (Corpus): The corpus this file is part of, if available.

        Returns:
            np.ndarray: The processed features.

        Raises:
            ValueError: If the loaded audio contains no samples.
        """
        frame_settings = units.FrameSettings(frame_size, hop_size)

        # end <= 0 signals "no explicit end": load from `start` to EOF.
        if end > 0:
            samples, sr = librosa.core.load(file_path,
                                            sr=sr,
                                            offset=start,
                                            duration=end - start)
        else:
            samples, sr = librosa.core.load(file_path, sr=sr, offset=start)

        if samples.size <= 0:
            raise ValueError('File {} has no samples'.format(file_path))

        # Pad with zeros to match frames
        # num_pad_samples is the exact length needed so that the final
        # frame of `num_frames` frames is fully populated.
        num_frames = frame_settings.num_frames(samples.size)
        num_pad_samples = (num_frames - 1) * hop_size + frame_size

        if num_pad_samples > samples.size:
            samples = np.pad(samples, (0, num_pad_samples - samples.size),
                             mode='constant',
                             constant_values=0)

        # Get sampling-rate if not given
        # NOTE(review): librosa.core.load always returns the effective
        # sampling rate, so after the load above `sr` is truthy for any
        # real audio; the `utterance.sampling_rate` fallback only fires
        # if sr is falsy (e.g. 0) — confirm this is the intended behavior.
        sampling_rate = sr or utterance.sampling_rate

        # Frame the padded signal; transpose so shape is (num_frames, frame_size).
        frames = librosa.util.frame(samples,
                                    frame_length=frame_size,
                                    hop_length=hop_size).T
        return self.process_frames(frames,
                                   sampling_rate,
                                   0,
                                   last=True,
                                   utterance=utterance,
                                   corpus=corpus)
Example 4
0
    def test_encode_utterance(self):
        """Ordinal-encode an utterance of the multi-label corpus per frame."""
        ds = resources.create_multi_label_corpus()
        enc = label_encoding.FrameOrdinalEncoder(
            ['music', 'speech', 'noise'],
            frame_settings=units.FrameSettings(32000, 16000),
            sr=16000)

        actual = enc.encode(ds.utterances['utt-6'])
        # np.int was removed in NumPy 1.24; the builtin int is the
        # documented, equivalent replacement.
        expected = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
                             0]).astype(int)

        assert np.array_equal(expected, actual)
Example 5
0
    def test_encode_utterance_takes_lower_index_first(self):
        """On a tie within a frame, the label with the lower encoder
        index wins."""
        file = assets.File('file-idx', resources.sample_wav_file('wav_1.wav'))
        utt = assets.Utterance('utt-idx', file, start=0, end=5)
        ll = assets.LabelList(
            labels=[assets.Label('music', 0, 3),
                    assets.Label('speech', 3, 5)])
        utt.set_label_list(ll)

        enc = label_encoding.FrameOrdinalEncoder(
            ['speech', 'music', 'noise'],
            frame_settings=units.FrameSettings(32000, 16000),
            sr=16000)

        actual = enc.encode(utt)
        # np.int was removed in NumPy 1.24; the builtin int is the
        # documented, equivalent replacement.
        expected = np.array([1, 1, 0, 0]).astype(int)

        assert np.array_equal(expected, actual)
Example 6
0
    def test_encode_utterance_takes_larger_label(self):
        """A frame straddling two labels is assigned the label covering
        the larger share of the frame."""
        file = tracks.FileTrack('file-idx',
                                resources.sample_wav_file('wav_1.wav'))
        utt = tracks.Utterance('utt-idx', file, start=0, end=8)
        ll = annotations.LabelList(labels=[
            annotations.Label('music', 0, 4.5),
            annotations.Label('speech', 4.5, 8)
        ])
        utt.set_label_list(ll)

        enc = encoding.FrameOrdinalEncoder(['music', 'speech', 'noise'],
                                           'default',
                                           frame_settings=units.FrameSettings(
                                               32000, 16000),
                                           sr=16000)

        actual = enc.encode_utterance(utt)
        # np.int was removed in NumPy 1.24; the builtin int is the
        # documented, equivalent replacement.
        expected = np.array([0, 0, 0, 0, 1, 1, 1]).astype(int)

        assert np.array_equal(expected, actual)
Example 7
0
 def test_time_range_to_frame_range(self, frame_size, hop_size, start_time, end_time, sr, start_index, end_index):
     """A time span in seconds maps to the expected (start, end) frame indices."""
     settings = units.FrameSettings(frame_size, hop_size)
     result = settings.time_range_to_frame_range(start_time, end_time, sr)
     assert result == (start_index, end_index)
Example 8
0
 def test_frame_to_seconds(self, frame_size, hop_size, frame_index, sr, start, end):
     """A frame index converts to the expected (start, end) pair in seconds."""
     settings = units.FrameSettings(frame_size, hop_size)
     result = settings.frame_to_seconds(frame_index, sr)
     assert result == (start, end)
Example 9
0
 def test_sample_to_frame_range(self, frame_size, hop_size, sample_index, start_frame, end_frame):
     """A sample index maps to the expected (first, last) frame range."""
     settings = units.FrameSettings(frame_size, hop_size)
     result = settings.sample_to_frame_range(sample_index)
     assert result == (start_frame, end_frame)
Example 10
0
 def test_frame_to_sample(self, frame_size, hop_size, frame_index, start_sample, end_sample):
     """A frame index converts to the expected (start, end) sample pair."""
     settings = units.FrameSettings(frame_size, hop_size)
     result = settings.frame_to_sample(frame_index)
     assert result == (start_sample, end_sample)
Example 11
0
 def test_num_frames(self, frame_size, hop_size, num_samples, num_frames):
     """The frame count for a given number of samples matches the expectation."""
     settings = units.FrameSettings(frame_size, hop_size)
     assert settings.num_frames(num_samples) == num_frames