def create_sample_dataset(temp_dir):
    ds = audiomate.Corpus(str(temp_dir))

    file_1_path = resources.sample_wav_file('wav_1.wav')
    file_2_path = resources.sample_wav_file('wav_2.wav')
    file_3_path = resources.get_resource_path(
        ['audio_formats', 'flac_1_16k_16b.flac'])

    file_1 = ds.new_file(file_1_path, track_idx='wav_1')
    file_2 = ds.new_file(file_2_path, track_idx='wav_2')
    file_3 = ds.new_file(file_3_path, track_idx='wav_3')

    issuer_1 = Speaker('spk-1', gender=Gender.MALE)
    issuer_2 = Speaker('spk-2', gender=Gender.FEMALE)
    issuer_3 = Issuer('spk-3')
    ds.import_issuers([issuer_1, issuer_2, issuer_3])

    # Track duration: 2.5951875 s
    utt_1 = ds.new_utterance('utt-1', file_1.idx, issuer_idx=issuer_1.idx)
    utt_2 = ds.new_utterance('utt-2', file_2.idx, issuer_idx=issuer_2.idx,
                             start=0, end=1.5)
    utt_3 = ds.new_utterance('utt-3', file_2.idx, issuer_idx=issuer_2.idx,
                             start=1.5, end=2.5)
    # Track duration: 5.0416875 s
    utt_4 = ds.new_utterance('utt-4', file_3.idx, issuer_idx=issuer_3.idx)

    utt_1.set_label_list(LabelList(audiomate.corpus.LL_WORD_TRANSCRIPT,
                                   labels=[Label('who am i')]))
    utt_2.set_label_list(LabelList(audiomate.corpus.LL_WORD_TRANSCRIPT,
                                   labels=[Label('who are you')]))
    utt_3.set_label_list(LabelList(audiomate.corpus.LL_WORD_TRANSCRIPT,
                                   labels=[Label('who is he')]))
    utt_4.set_label_list(LabelList(audiomate.corpus.LL_WORD_TRANSCRIPT,
                                   labels=[Label('who are they')]))

    train_filter = subview.MatchingUtteranceIdxFilter(
        utterance_idxs={'utt-1', 'utt-2', 'utt-3'})
    sv_train = subview.Subview(ds, filter_criteria=[train_filter])

    dev_filter = subview.MatchingUtteranceIdxFilter(utterance_idxs={'utt-4'})
    sv_dev = subview.Subview(ds, filter_criteria=[dev_filter])

    ds.import_subview('train', sv_train)
    ds.import_subview('dev', sv_dev)

    return ds
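# A minimal sketch (not part of the original suite) of how the fixture above
# could be exercised, assuming audiomate's num_utterances/num_issuers
# properties and the subviews dict behave as they are used elsewhere in
# these tests; test name and assertions are illustrative only:
def test_create_sample_dataset(tmp_path):
    ds = create_sample_dataset(tmp_path)

    assert ds.num_utterances == 4
    assert ds.num_issuers == 3
    assert ds.subviews['train'].num_utterances == 3
    assert ds.subviews['dev'].num_utterances == 1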
def test_validate_passes(self):
    corpus = audiomate.Corpus()
    corpus.new_file(resources.sample_wav_file('wav_1.wav'), 'wav1')
    corpus.new_file(resources.sample_wav_file('wav_2.wav'), 'wav2')

    val = validation.TrackReadValidator()
    res = val.validate(corpus)

    assert res.passed
def test_validate_doesnt_pass(self):
    corpus = audiomate.Corpus()
    corpus.new_file(resources.sample_wav_file('wav_1.wav'), 'wav1')
    corpus.new_file(resources.sample_wav_file('invalid_audio.wav'), 'wav2')

    val = validation.TrackReadValidator()
    res = val.validate(corpus)

    assert not res.passed
    assert len(res.invalid_items) == 1
    assert 'wav2' in res.invalid_items
def test_compute_online(self):
    # Data: 41523 samples at 16 kHz,
    # yields 40 frames with frame-size 2048 and hop-size 1024.
    test_file_path = resources.sample_wav_file('wav_1.wav')
    y, sr = librosa.load(test_file_path, sr=None)

    # EXPECTED
    y_pad = np.pad(y, (0, 1024), mode='constant', constant_values=0)
    S = np.abs(librosa.stft(y_pad, center=False, n_fft=2048,
                            hop_length=1024))**2
    S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
    S = librosa.power_to_db(S)
    onsets = librosa.onset.onset_strength(S=S, center=False)
    exp_tgram = librosa.feature.tempogram(onset_envelope=onsets, sr=sr,
                                          win_length=4, center=True).T

    # ACTUAL
    tgram_step = pipeline.Tempogram(win_length=4)
    tgram_gen = tgram_step.process_file_online(test_file_path, 2048, 1024,
                                               chunk_size=5)
    chunks = list(tgram_gen)
    tgrams = np.vstack(chunks)

    assert np.allclose(tgrams, exp_tgram)
def test_compute_online(self):
    test_file_path = resources.sample_wav_file('wav_1.wav')
    y, sr = librosa.load(test_file_path, sr=None)

    # EXPECTED
    y_pad = np.pad(y, (0, 1024), mode='constant', constant_values=0)
    S = np.abs(librosa.stft(y_pad, center=False, n_fft=2048,
                            hop_length=1024))**2
    S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
    S = librosa.power_to_db(S)
    exp_onsets = librosa.onset.onset_strength(S=S, center=False).T
    exp_onsets = exp_onsets.reshape(exp_onsets.shape[0], 1)

    # ACTUAL
    onset = pipeline.OnsetStrength()
    onset_gen = onset.process_file_online(test_file_path, 2048, 1024,
                                          chunk_size=5)
    chunks = list(onset_gen)
    onsets = np.vstack(chunks)

    assert np.allclose(onsets, exp_onsets)
def test_does_utt_match_target_format_returns_true(self):
    file_path = resources.sample_wav_file('wav_1.wav')
    track = tracks.FileTrack('t', file_path)
    utt = tracks.Utterance('u', track)

    c = conversion.WavAudioFileConverter()

    assert c._does_utt_match_target_format(utt)
def ds():
    ds = resources.create_dataset()

    file_1_path = resources.sample_wav_file('wav_1.wav')
    file_2_path = resources.get_resource_path(
        ('audio_formats', 'mp3_2_44_1k_16b.mp3'))
    file_3_path = resources.get_resource_path(
        ('audio_formats', 'flac_1_16k_16b.flac'))
    file_4_path = resources.sample_wav_file('wav_4.wav')

    ds.tracks['wav-1'].path = file_1_path
    ds.tracks['wav_2'].path = file_2_path
    ds.tracks['wav_3'].path = file_3_path
    ds.tracks['wav_4'].path = file_4_path

    return ds
def test_compute_cleanup_after_one_utterance(self):
    test_file_path = resources.sample_wav_file('wav_1.wav')
    y, sr = librosa.load(test_file_path, sr=None)
    frames = librosa.util.frame(y, frame_length=2048, hop_length=1024).T

    # EXPECTED
    S = np.abs(librosa.stft(y, center=False, n_fft=2048, hop_length=1024))**2
    S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
    S = librosa.power_to_db(S)
    onsets = librosa.onset.onset_strength(S=S, center=False)
    exp_tgram = librosa.feature.tempogram(onset_envelope=onsets, sr=sr,
                                          win_length=11, center=True).T

    # ACTUAL
    tgram_step = pipeline.Tempogram(win_length=11)

    # FIRST RUN
    tgrams = tgram_step.process_frames(frames, sr, last=True)
    assert np.allclose(tgrams, exp_tgram)

    # SECOND RUN
    tgrams = tgram_step.process_frames(frames, sr, last=True)
    assert np.allclose(tgrams, exp_tgram)
def test_read_samples(self):
    file = assets.File('wav', resources.sample_wav_file('wav_1.wav'))
    issuer = assets.Issuer('toni')
    utt = assets.Utterance('test', file, issuer=issuer, start=1.0, end=2.30)

    l1 = assets.Label('a', 0.15, 0.448)
    l2 = assets.Label('a', 0.5, 0.73)
    ll = assets.LabelList(labels=[l1, l2])
    utt.set_label_list(ll)

    expected, __ = librosa.core.load(file.path, sr=None,
                                     offset=1.15, duration=0.298)
    assert np.array_equal(l1.read_samples(), expected)

    expected, __ = librosa.core.load(file.path, sr=None,
                                     offset=1.5, duration=0.23)
    assert np.array_equal(l2.read_samples(), expected)
def test_read_samples(self):
    path = resources.sample_wav_file('wav_1.wav')
    track = tracks.FileTrack('wav', path)
    issuer = issuers.Issuer('toni')
    utt = tracks.Utterance('t', track, issuer=issuer, start=1.0, end=2.30)

    l1 = annotations.Label('a', 0.15, 0.448)
    l2 = annotations.Label('a', 0.5, 0.73)
    ll = annotations.LabelList(labels=[l1, l2])
    utt.set_label_list(ll)

    expected, __ = librosa.core.load(path, sr=None,
                                     offset=1.15, duration=0.298)
    assert np.array_equal(l1.read_samples(), expected)

    expected, __ = librosa.core.load(path, sr=None,
                                     offset=1.5, duration=1.73 - 1.5)
    assert np.array_equal(l2.read_samples(), expected)
def setUp(self):
    file = assets.File('wav', resources.sample_wav_file('wav_1.wav'))
    # end=-1 means the utterance/label extends to the end of the track
    utt = assets.Utterance('utt', file, start=0.3, end=-1)

    ll = assets.LabelList()
    self.test_label = assets.Label('a', start=0.5, end=-1)
    ll.append(self.test_label)

    utt.set_label_list(ll)
def test_encode_label_ends_at_utterance_end(self):
    track = tracks.FileTrack('file1', resources.sample_wav_file('med_len.wav'))
    utt = tracks.Utterance('utt1', track, start=3, end=14)
    ll = annotations.LabelList(labels=[
        annotations.Label('speech', 0, 4),
        annotations.Label('music', 4, 9),
        annotations.Label('speech', 9, float('inf')),
    ])
    utt.set_label_list(ll)

    enc = encoding.FrameHotEncoder(
        ['music', 'speech', 'noise'], 'default',
        frame_settings=units.FrameSettings(32000, 16000),
        sr=16000)

    actual = enc.encode_utterance(utt)
    expected = np.array([
        [0, 1, 0],
        [0, 1, 0],
        [0, 1, 0],
        [1, 1, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 1, 0],
        [0, 1, 0],
    ]).astype(np.float32)

    assert np.array_equal(expected, actual)
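# Sanity check of the expected matrix above (editorial reasoning, not from
# the original source): FrameSettings(32000, 16000) at sr=16000 means 2 s
# frames with a 1 s hop. The utterance covers 3-14 s of the track, i.e. 11 s,
# which yields 10 frames. Frame i spans [i, i+2) s relative to the utterance,
# so frame 3 (3-5 s) overlaps both 'speech' (ends at 4) and 'music' (starts
# at 4) and is encoded [1, 1, 0], frame 8 (8-10 s) overlaps 'music' and the
# final 'speech' label, and that label is clipped at the utterance end (11 s).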
def test_save_file_tracks(self, writer, sample_corpus, tmpdir):
    # make sure the relative path changes in contrast to self.ds.path
    out_path = os.path.join(tmpdir.strpath, 'somesubdir')
    os.makedirs(out_path)

    writer.save(sample_corpus, out_path)

    file_1_path = os.path.relpath(resources.sample_wav_file('wav_1.wav'), out_path)
    file_2_path = os.path.relpath(resources.sample_wav_file('wav_2.wav'), out_path)
    file_3_path = os.path.relpath(resources.sample_wav_file('wav_3.wav'), out_path)
    file_4_path = os.path.relpath(resources.sample_wav_file('wav_4.wav'), out_path)

    with open(os.path.join(out_path, 'files.txt'), 'r') as f:
        file_content = f.read()

    assert file_content.strip() == 'wav-1 {}\nwav_2 {}\nwav_3 {}\nwav_4 {}'.format(
        file_1_path, file_2_path, file_3_path, file_4_path)
def test_convert_files(self, tmp_path):
    source_path = resources.sample_wav_file('wav_1.wav')
    target_path = tmp_path / 'out.wav'
    files = [(source_path, 0, float('inf'), str(target_path))]

    c = conversion.WavAudioFileConverter()
    c._convert_files(files)

    samples, sr = librosa.core.load(source_path, sr=None)
    stored_samples, stored_sr = librosa.core.load(str(target_path), sr=None)

    assert target_path.is_file()
    assert stored_sr == sr
    assert np.array_equal(stored_samples, samples)
def test_encode_utterance_takes_lower_index_first(self):
    file = assets.File('file-idx', resources.sample_wav_file('wav_1.wav'))
    utt = assets.Utterance('utt-idx', file, start=0, end=5)
    ll = assets.LabelList(labels=[
        assets.Label('music', 0, 3),
        assets.Label('speech', 3, 5)
    ])
    utt.set_label_list(ll)

    enc = label_encoding.FrameOrdinalEncoder(
        ['speech', 'music', 'noise'],
        frame_settings=units.FrameSettings(32000, 16000),
        sr=16000)

    actual = enc.encode(utt)
    # np.int is removed in NumPy >= 1.24; the builtin int is equivalent here
    expected = np.array([1, 1, 0, 0]).astype(int)

    assert np.array_equal(expected, actual)
def setup_method(self):
    self.ll_1 = annotations.LabelList(idx='alpha', labels=[
        annotations.Label('a', 3.2, 4.5),
        annotations.Label('b', 5.1, 8.9),
        annotations.Label('c', 7.2, 10.5),
        annotations.Label('d', 10.5, 14),
        annotations.Label('d', 15, 18)
    ])

    self.ll_2 = annotations.LabelList(idx='bravo', labels=[
        annotations.Label('a', 1.0, 4.2),
        annotations.Label('e', 4.2, 7.9),
        annotations.Label('c', 7.2, 10.5),
        annotations.Label('f', 10.5, 14),
        annotations.Label('d', 15, 17.3)
    ])

    self.ll_duplicate_idx = annotations.LabelList(idx='charlie', labels=[
        annotations.Label('t', 1.0, 4.2),
        annotations.Label('h', 4.2, 7.9)
    ])

    self.ll_3 = annotations.LabelList(idx='charlie', labels=[
        annotations.Label('a', 1.0, 4.2),
        annotations.Label('g', 4.2, 7.9)
    ])

    self.track = tracks.FileTrack('wav', resources.sample_wav_file('wav_1.wav'))
    self.issuer = issuers.Issuer('toni')
    self.utt = tracks.Utterance(
        'test', self.track,
        issuer=self.issuer,
        start=1.25, end=1.30,
        label_lists=[self.ll_1, self.ll_2, self.ll_duplicate_idx, self.ll_3])
def test_store_samples_sr_24(self, tmp_path):
    source_path = resources.sample_wav_file('wav_1.wav')
    target_path = tmp_path / 'out.wav'
    files = [(source_path, 0, float('inf'), str(target_path))]

    c = conversion.WavAudioFileConverter(sampling_rate=24000)
    c._convert_files(files)

    samples, sr = librosa.core.load(source_path, sr=24000)
    stored_samples, stored_sr = librosa.core.load(str(target_path), sr=None)

    assert target_path.is_file()
    assert stored_sr == sr

    # Don't compare too strictly:
    # with sox 14.4.1 resampling isn't that precise, especially the first sample.
    assert np.allclose(stored_samples[1:], samples[1:], atol=0.001)
def test_compute(self):
    test_file_path = resources.sample_wav_file('wav_1.wav')
    y, sr = librosa.load(test_file_path, sr=None)
    frames = librosa.util.frame(y, frame_length=2048, hop_length=1024).T

    # EXPECTED
    S = np.abs(librosa.stft(y, center=False, n_fft=2048, hop_length=1024))**2
    S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
    S = librosa.power_to_db(S)
    exp_onsets = librosa.onset.onset_strength(S=S, center=False).T
    exp_onsets = exp_onsets.reshape(exp_onsets.shape[0], 1)

    # ACTUAL
    onset = pipeline.OnsetStrength()
    onsets = onset.process_frames(frames, sr, last=True)

    assert np.allclose(onsets, exp_onsets)
def test_encode_utterance_takes_larger_label(self):
    file = tracks.FileTrack('file-idx', resources.sample_wav_file('wav_1.wav'))
    utt = tracks.Utterance('utt-idx', file, start=0, end=8)
    ll = annotations.LabelList(labels=[
        annotations.Label('music', 0, 4.5),
        annotations.Label('speech', 4.5, 8)
    ])
    utt.set_label_list(ll)

    enc = encoding.FrameOrdinalEncoder(
        ['music', 'speech', 'noise'], 'default',
        frame_settings=units.FrameSettings(32000, 16000),
        sr=16000)

    actual = enc.encode_utterance(utt)
    # np.int is removed in NumPy >= 1.24; the builtin int is equivalent here
    expected = np.array([0, 0, 0, 0, 1, 1, 1]).astype(int)

    assert np.array_equal(expected, actual)
def sample_utterance():
    file_track = tracks.FileTrack('test_file', resources.sample_wav_file('wav_1.wav'))
    utterance = tracks.Utterance('test', file_track)
    return utterance
def sample_utterance():
    file = assets.File('test_file', resources.sample_wav_file('wav_1.wav'))
    utterance = assets.Utterance('test', file)
    return utterance