def test_encode_label_ends_at_utterance_end(self):
    """A label whose end is ``inf`` must be clipped to the utterance's end when encoding.

    Builds an utterance (3s-14s) whose last label runs to infinity and checks
    that the one-hot frame matrix stops at the utterance boundary.
    """
    track = tracks.FileTrack('file1', resources.sample_wav_file('med_len.wav'))
    utterance = tracks.Utterance('utt1', track, start=3, end=14)

    label_list = annotations.LabelList(labels=[
        annotations.Label('speech', 0, 4),
        annotations.Label('music', 4, 9),
        # Open-ended label: should be treated as "until utterance end".
        annotations.Label('speech', 9, float('inf')),
    ])
    utterance.set_label_list(label_list)

    encoder = encoding.FrameHotEncoder(
        ['music', 'speech', 'noise'],
        'default',
        frame_settings=units.FrameSettings(32000, 16000),
        sr=16000
    )
    result = encoder.encode_utterance(utterance)

    # Columns are [music, speech, noise]; one row per 1-second hop.
    expected = np.array([
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [1, 1, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 1, 0],
        [0, 1, 0],
    ], dtype=np.float32)

    assert np.array_equal(expected, result)
def test_encode_full_utterance(self):
    """FrameHotEncoder produces the expected one-hot matrix for a full utterance.

    Uses the shared multi-label fixture corpus and encodes utterance 'utt-6'
    with 2s frames and 1s hops at 16 kHz.
    """
    corpus = resources.create_multi_label_corpus()

    encoder = encoding.FrameHotEncoder(
        ['music', 'speech', 'noise'],
        'default',
        frame_settings=units.FrameSettings(32000, 16000),
        sr=16000
    )
    result = encoder.encode_utterance(corpus.utterances['utt-6'])

    # Columns are [music, speech, noise]; overlapping frames at the
    # label transition carry both classes.
    expected = np.array([
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [1, 0, 0],
        [1, 0, 0],
    ], dtype=np.float32)

    assert np.array_equal(expected, result)
def process_file(self, file_path, frame_size=400, hop_size=160, sr=None,
                 start=0, end=-1, utterance=None, corpus=None):
    """
    Process the audio-file in **offline** mode, in one go.

    Args:
        file_path (str): The audio file to process.
        frame_size (int): The number of samples per frame.
        hop_size (int): The number of samples between two frames.
        sr (int): Use the given sampling rate. If None uses the native
                  sampling rate from the underlying data.
        start (float): The point within the file in seconds to start
                      processing from.
        end (float): The point within the file in seconds to end
                     processing. A non-positive value means "to the end
                     of the file".
        utterance (Utterance): The utterance that is associated with
                               this file, if available.
        corpus (Corpus): The corpus this file is part of, if available.

    Returns:
        np.ndarray: The processed features.
    """
    frame_settings = units.FrameSettings(frame_size, hop_size)

    # Only pass a duration when an explicit end point was requested.
    load_kwargs = {'sr': sr, 'offset': start}
    if end > 0:
        load_kwargs['duration'] = end - start

    samples, sr = librosa.core.load(file_path, **load_kwargs)

    if samples.size <= 0:
        raise ValueError('File {} has no samples'.format(file_path))

    # Zero-pad the tail so the sample count fills the last frame exactly.
    num_frames = frame_settings.num_frames(samples.size)
    required_samples = (num_frames - 1) * hop_size + frame_size
    missing = required_samples - samples.size

    if missing > 0:
        samples = np.pad(
            samples, (0, missing), mode='constant', constant_values=0)

    # Fall back to the utterance's rate if no sampling rate is available.
    sampling_rate = sr or utterance.sampling_rate

    frames = librosa.util.frame(
        samples, frame_length=frame_size, hop_length=hop_size).T

    return self.process_frames(frames, sampling_rate, 0, last=True,
                               utterance=utterance, corpus=corpus)
def test_encode_utterance(self):
    """FrameOrdinalEncoder maps every frame of 'utt-6' to its label's class index.

    Index 0 = 'music', 1 = 'speech', 2 = 'noise'; frames are 2s with a
    1s hop at 16 kHz.
    """
    ds = resources.create_multi_label_corpus()

    enc = label_encoding.FrameOrdinalEncoder(
        ['music', 'speech', 'noise'],
        frame_settings=units.FrameSettings(32000, 16000),
        sr=16000)

    actual = enc.encode(ds.utterances['utt-6'])

    # FIX: `np.int` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `int` is the supported equivalent spelling.
    expected = np.array(
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0]).astype(int)

    assert np.array_equal(expected, actual)
def test_encode_utterance_takes_lower_index_first(self):
    """On a tie between overlapping labels, the class with the lower index wins.

    'music' and 'speech' both cover the transition frame; since 'speech'
    has index 0 in the class list it must be chosen there.
    """
    file = assets.File('file-idx', resources.sample_wav_file('wav_1.wav'))
    utt = assets.Utterance('utt-idx', file, start=0, end=5)

    ll = assets.LabelList(labels=[
        assets.Label('music', 0, 3),
        assets.Label('speech', 3, 5),
    ])
    utt.set_label_list(ll)

    enc = label_encoding.FrameOrdinalEncoder(
        ['speech', 'music', 'noise'],
        frame_settings=units.FrameSettings(32000, 16000),
        sr=16000)

    actual = enc.encode(utt)

    # FIX: `np.int` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `int` is the supported equivalent spelling.
    expected = np.array([1, 1, 0, 0]).astype(int)

    assert np.array_equal(expected, actual)
def test_encode_utterance_takes_larger_label(self):
    """A frame overlapping two labels is assigned to the label covering more of it.

    The boundary at 4.5s splits a frame unevenly; the frame must take the
    class that occupies the larger share.
    """
    file = tracks.FileTrack('file-idx', resources.sample_wav_file('wav_1.wav'))
    utt = tracks.Utterance('utt-idx', file, start=0, end=8)

    ll = annotations.LabelList(labels=[
        annotations.Label('music', 0, 4.5),
        annotations.Label('speech', 4.5, 8),
    ])
    utt.set_label_list(ll)

    enc = encoding.FrameOrdinalEncoder(
        ['music', 'speech', 'noise'],
        'default',
        frame_settings=units.FrameSettings(32000, 16000),
        sr=16000)

    actual = enc.encode_utterance(utt)

    # FIX: `np.int` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `int` is the supported equivalent spelling.
    expected = np.array([0, 0, 0, 0, 1, 1, 1]).astype(int)

    assert np.array_equal(expected, actual)
def test_time_range_to_frame_range(self, frame_size, hop_size, start_time,
                                   end_time, sr, start_index, end_index):
    """A (start, end) time span maps to the expected (start, end) frame span."""
    settings = units.FrameSettings(frame_size, hop_size)
    frame_range = settings.time_range_to_frame_range(start_time, end_time, sr)
    assert frame_range == (start_index, end_index)
def test_frame_to_seconds(self, frame_size, hop_size, frame_index, sr,
                          start, end):
    """A frame index converts to the expected (start, end) pair in seconds."""
    settings = units.FrameSettings(frame_size, hop_size)
    seconds = settings.frame_to_seconds(frame_index, sr)
    assert seconds == (start, end)
def test_sample_to_frame_range(self, frame_size, hop_size, sample_index,
                               start_frame, end_frame):
    """A sample index maps to the expected range of frames containing it."""
    settings = units.FrameSettings(frame_size, hop_size)
    frame_range = settings.sample_to_frame_range(sample_index)
    assert frame_range == (start_frame, end_frame)
def test_frame_to_sample(self, frame_size, hop_size, frame_index,
                         start_sample, end_sample):
    """A frame index maps to the expected (start, end) sample pair."""
    settings = units.FrameSettings(frame_size, hop_size)
    sample_range = settings.frame_to_sample(frame_index)
    assert sample_range == (start_sample, end_sample)
def test_num_frames(self, frame_size, hop_size, num_samples, num_frames):
    """The frame count for a given number of samples matches the expectation."""
    settings = units.FrameSettings(frame_size, hop_size)
    assert settings.num_frames(num_samples) == num_frames