Code Example #1
File: test_nvidia_jasper.py  Project: ynop/audiomate
def create_sample_dataset(temp_dir):
    ds = audiomate.Corpus(str(temp_dir))

    file_1_path = resources.sample_wav_file('wav_1.wav')
    file_2_path = resources.sample_wav_file('wav_2.wav')
    file_3_path = resources.get_resource_path(
        ['audio_formats', 'flac_1_16k_16b.flac'])

    file_1 = ds.new_file(file_1_path, track_idx='wav_1')
    file_2 = ds.new_file(file_2_path, track_idx='wav_2')
    file_3 = ds.new_file(file_3_path, track_idx='wav_3')

    issuer_1 = Speaker('spk-1', gender=Gender.MALE)
    issuer_2 = Speaker('spk-2', gender=Gender.FEMALE)
    issuer_3 = Issuer('spk-3')

    ds.import_issuers([issuer_1, issuer_2, issuer_3])

    # wav_1.wav is 2.5951875 s long; utt-1 spans the whole file
    utt_1 = ds.new_utterance('utt-1', file_1.idx, issuer_idx=issuer_1.idx)
    utt_2 = ds.new_utterance('utt-2',
                             file_2.idx,
                             issuer_idx=issuer_2.idx,
                             start=0,
                             end=1.5)
    utt_3 = ds.new_utterance('utt-3',
                             file_2.idx,
                             issuer_idx=issuer_2.idx,
                             start=1.5,
                             end=2.5)
    # flac_1_16k_16b.flac is 5.0416875 s long; utt-4 spans the whole file
    utt_4 = ds.new_utterance('utt-4', file_3.idx, issuer_idx=issuer_3.idx)

    utt_1.set_label_list(
        LabelList(audiomate.corpus.LL_WORD_TRANSCRIPT,
                  labels=[Label('who am i')]))
    utt_2.set_label_list(
        LabelList(audiomate.corpus.LL_WORD_TRANSCRIPT,
                  labels=[Label('who are you')]))
    utt_3.set_label_list(
        LabelList(audiomate.corpus.LL_WORD_TRANSCRIPT,
                  labels=[Label('who is he')]))
    utt_4.set_label_list(
        LabelList(audiomate.corpus.LL_WORD_TRANSCRIPT,
                  labels=[Label('who are they')]))

    train_filter = subview.MatchingUtteranceIdxFilter(
        utterance_idxs={'utt-1', 'utt-2', 'utt-3'})
    sv_train = subview.Subview(ds, filter_criteria=[train_filter])

    dev_filter = subview.MatchingUtteranceIdxFilter(utterance_idxs={'utt-4'})
    sv_dev = subview.Subview(ds, filter_criteria=[dev_filter])

    ds.import_subview('train', sv_train)
    ds.import_subview('dev', sv_dev)

    return ds
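
A hypothetical companion snippet (not part of the test file): reading the subviews and transcripts back out of the corpus built above, using only the audiomate attributes that appear elsewhere in these examples (subviews, utterances, label_lists).

def print_transcripts(ds):
    # Walk the 'train' and 'dev' subviews created above and print each
    # utterance's word transcript.
    for name in ('train', 'dev'):
        sv = ds.subviews[name]
        for utt_idx, utt in sv.utterances.items():
            ll = utt.label_lists[audiomate.corpus.LL_WORD_TRANSCRIPT]
            transcript = ' '.join(label.value for label in ll)
            print(name, utt_idx, transcript)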
Code Example #2
    def test_validate_passes(self):
        corpus = audiomate.Corpus()
        corpus.new_file(resources.sample_wav_file('wav_1.wav'), 'wav1')
        corpus.new_file(resources.sample_wav_file('wav_2.wav'), 'wav2')

        val = validation.TrackReadValidator()
        res = val.validate(corpus)

        assert res.passed
Code Example #3
    def test_validate_doesnt_pass(self):
        corpus = audiomate.Corpus()
        corpus.new_file(resources.sample_wav_file('wav_1.wav'), 'wav1')
        corpus.new_file(resources.sample_wav_file('invalid_audio.wav'), 'wav2')

        val = validation.TrackReadValidator()
        res = val.validate(corpus)

        assert not res.passed
        assert len(res.invalid_items) == 1
        assert 'wav2' in res.invalid_items
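
The prepared invalid_audio.wav resource is what makes the validator fail here; a hypothetical companion test could provoke the same failure with a hand-written broken file (tmp_path is pytest's built-in fixture; everything else is the API already used above):

    def test_validate_flags_truncated_file(self, tmp_path):
        # A file that is not decodable audio should be reported as invalid.
        bad_path = tmp_path / 'broken.wav'
        bad_path.write_bytes(b'RIFF but not actually a wav file')

        corpus = audiomate.Corpus()
        corpus.new_file(resources.sample_wav_file('wav_1.wav'), 'good')
        corpus.new_file(str(bad_path), 'bad')

        res = validation.TrackReadValidator().validate(corpus)

        assert not res.passed
        assert 'bad' in res.invalid_items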
Code Example #4
    def test_compute_online(self):
        # Data: 41523 samples, 16 kHz
        # yields 40 frames with frame-size 2048 and hop-size 1024
        test_file_path = resources.sample_wav_file('wav_1.wav')
        y, sr = librosa.load(test_file_path, sr=None)

        # EXPECTED
        y_pad = np.pad(y, (0, 1024), mode='constant', constant_values=0)
        S = np.abs(
            librosa.stft(y_pad, center=False, n_fft=2048, hop_length=1024))**2
        S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
        S = librosa.power_to_db(S)
        onsets = librosa.onset.onset_strength(S=S, center=False)
        exp_tgram = librosa.feature.tempogram(onset_envelope=onsets,
                                              sr=sr,
                                              win_length=4,
                                              center=True).T

        # ACTUAL
        tgram_step = pipeline.Tempogram(win_length=4)
        tgram_gen = tgram_step.process_file_online(test_file_path,
                                                   2048,
                                                   1024,
                                                   chunk_size=5)

        chunks = list(tgram_gen)
        tgrams = np.vstack(chunks)

        assert np.allclose(tgrams, exp_tgram)
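
The "40 frames" in the comment follows from standard framing arithmetic; a quick check of the numbers stated above:

n_samples = 41523            # length of wav_1.wav at 16 kHz, per the comment
n_padded = n_samples + 1024  # the test right-pads by one hop
n_fft, hop = 2048, 1024

n_frames = 1 + (n_padded - n_fft) // hop  # framing with center=False
assert n_frames == 40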
Code Example #5
    def test_compute_online(self):
        test_file_path = resources.sample_wav_file('wav_1.wav')
        y, sr = librosa.load(test_file_path, sr=None)

        # EXPECTED
        y_pad = np.pad(y, (0, 1024), mode='constant', constant_values=0)
        S = np.abs(
            librosa.stft(y_pad, center=False, n_fft=2048, hop_length=1024))**2
        S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
        S = librosa.power_to_db(S)
        exp_onsets = librosa.onset.onset_strength(S=S, center=False).T
        exp_onsets = exp_onsets.reshape(exp_onsets.shape[0], 1)

        # ACTUAL
        onset = pipeline.OnsetStrength()
        onset_gen = onset.process_file_online(test_file_path,
                                              2048,
                                              1024,
                                              chunk_size=5)

        chunks = list(onset_gen)
        onsets = np.vstack(chunks)

        assert np.allclose(onsets, exp_onsets)
Code Example #6
    def test_does_utt_match_target_format_returns_true(self):
        file_path = resources.sample_wav_file('wav_1.wav')
        track = tracks.FileTrack('t', file_path)
        utt = tracks.Utterance('u', track)

        c = conversion.WavAudioFileConverter()
        assert c._does_utt_match_target_format(utt)
Code Example #7
File: test_base.py  Project: ynop/audiomate
def ds():
    ds = resources.create_dataset()

    file_1_path = resources.sample_wav_file('wav_1.wav')
    file_2_path = resources.get_resource_path(
        ('audio_formats', 'mp3_2_44_1k_16b.mp3'))
    file_3_path = resources.get_resource_path(
        ('audio_formats', 'flac_1_16k_16b.flac'))
    file_4_path = resources.sample_wav_file('wav_4.wav')

    ds.tracks['wav-1'].path = file_1_path
    ds.tracks['wav_2'].path = file_2_path
    ds.tracks['wav_3'].path = file_3_path
    ds.tracks['wav_4'].path = file_4_path

    return ds
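
FileTrack is format-agnostic, so a sketch like the following could exercise the mixed-format fixture above (read_samples() and tracks are part of the audiomate API; decoding the mp3 and flac entries depends on the installed audio backend):

    def test_reads_all_formats(self, ds):
        # Every track should decode to a one-dimensional array of samples,
        # regardless of container format (wav, mp3, flac).
        for idx in ('wav-1', 'wav_2', 'wav_3', 'wav_4'):
            samples = ds.tracks[idx].read_samples()
            assert samples.ndim == 1
            assert samples.size > 0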
Code Example #8
    def test_compute_cleanup_after_one_utterance(self):
        test_file_path = resources.sample_wav_file('wav_1.wav')
        y, sr = librosa.load(test_file_path, sr=None)
        frames = librosa.util.frame(y, frame_length=2048, hop_length=1024).T

        # EXPECTED
        S = np.abs(librosa.stft(y, center=False, n_fft=2048,
                                hop_length=1024))**2
        S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
        S = librosa.power_to_db(S)
        onsets = librosa.onset.onset_strength(S=S, center=False)
        exp_tgram = librosa.feature.tempogram(onset_envelope=onsets,
                                              sr=sr,
                                              win_length=11,
                                              center=True).T

        # ACTUAL
        tgram_step = pipeline.Tempogram(win_length=11)

        # FIRST RUN
        tgrams = tgram_step.process_frames(frames, sr, last=True)

        assert np.allclose(tgrams, exp_tgram)

        # SECOND RUN
        tgrams = tgram_step.process_frames(frames, sr, last=True)

        assert np.allclose(tgrams, exp_tgram)
Code Example #9
File: test_label.py  Project: toddrme2178/audiomate
    def test_read_samples(self):
        file = assets.File('wav', resources.sample_wav_file('wav_1.wav'))
        issuer = assets.Issuer('toni')
        utt = assets.Utterance('test',
                               file,
                               issuer=issuer,
                               start=1.0,
                               end=2.30)

        l1 = assets.Label('a', 0.15, 0.448)
        l2 = assets.Label('a', 0.5, 0.73)
        ll = assets.LabelList(labels=[l1, l2])

        utt.set_label_list(ll)

        expected, __ = librosa.core.load(file.path,
                                         sr=None,
                                         offset=1.15,
                                         duration=0.298)
        assert np.array_equal(l1.read_samples(), expected)

        expected, __ = librosa.core.load(file.path,
                                         sr=None,
                                         offset=1.5,
                                         duration=0.23)
        assert np.array_equal(l2.read_samples(), expected)
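
The librosa offsets in the test are just the label times shifted by the utterance start; spelled out with the values used above:

utt_start = 1.0
l1_start, l1_end = 0.15, 0.448

offset = utt_start + l1_start  # 1.15 s into the file
duration = l1_end - l1_start   # 0.298 s of audio

assert abs(offset - 1.15) < 1e-9
assert abs(duration - 0.298) < 1e-9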
Code Example #10
    def test_read_samples(self):
        path = resources.sample_wav_file('wav_1.wav')
        track = tracks.FileTrack('wav', path)
        issuer = issuers.Issuer('toni')
        utt = tracks.Utterance('t', track, issuer=issuer, start=1.0, end=2.30)

        l1 = annotations.Label('a', 0.15, 0.448)
        l2 = annotations.Label('a', 0.5, 0.73)
        ll = annotations.LabelList(labels=[l1, l2])

        utt.set_label_list(ll)

        expected, __ = librosa.core.load(path,
                                         sr=None,
                                         offset=1.15,
                                         duration=0.298)
        assert np.array_equal(l1.read_samples(), expected)

        expected, __ = librosa.core.load(path,
                                         sr=None,
                                         offset=1.5,
                                         duration=1.73 - 1.5)  # 0.23 s: label spans 1.5 s to 1.73 s in the file

        assert np.array_equal(l2.read_samples(), expected)
Code Example #11
File: test_label.py  Project: toddrme2178/audiomate
    def setUp(self):
        file = assets.File('wav', resources.sample_wav_file('wav_1.wav'))
        # In the old assets API, end=-1 means "until the end".
        utt = assets.Utterance('utt', file, start=0.3, end=-1)
        ll = assets.LabelList()
        self.test_label = assets.Label('a', start=0.5, end=-1)
        ll.append(self.test_label)
        utt.set_label_list(ll)
Code Example #12
File: test_frame_based.py  Project: ynop/audiomate
    def test_encode_label_ends_at_utterance_end(self):
        track = tracks.FileTrack('file1',
                                 resources.sample_wav_file('med_len.wav'))
        utt = tracks.Utterance('utt1', track, start=3, end=14)
        ll = annotations.LabelList(labels=[
            annotations.Label('speech', 0, 4),
            annotations.Label('music', 4, 9),
            annotations.Label('speech', 9, float('inf')),
        ])
        utt.set_label_list(ll)

        enc = encoding.FrameHotEncoder(['music', 'speech', 'noise'],
                                       'default',
                                       frame_settings=units.FrameSettings(
                                           32000, 16000),
                                       sr=16000)

        actual = enc.encode_utterance(utt)
        expected = np.array([
            [0, 1, 0],
            [0, 1, 0],
            [0, 1, 0],
            [1, 1, 0],
            [1, 0, 0],
            [1, 0, 0],
            [1, 0, 0],
            [1, 0, 0],
            [1, 1, 0],
            [0, 1, 0],
        ]).astype(np.float32)

        assert np.array_equal(expected, actual)
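
A minimal re-derivation of the expected matrix (not audiomate's implementation): a class is hot in a frame if its label overlaps the frame at all, with times taken relative to the utterance (3 s to 14 s, so 11 s of audio) and the open-ended label clipped to the utterance end.

import numpy as np

classes = ['music', 'speech', 'noise']
labels = [('speech', 0.0, 4.0), ('music', 4.0, 9.0), ('speech', 9.0, 11.0)]
frame_size, hop, sr = 32000, 16000, 16000

n_samples = 11 * sr
n_frames = int(np.ceil((n_samples - frame_size) / hop)) + 1  # 10 frames

expected = np.zeros((n_frames, len(classes)), dtype=np.float32)
for i in range(n_frames):
    f_start, f_end = i * hop / sr, (i * hop + frame_size) / sr
    for value, l_start, l_end in labels:
        if l_start < f_end and l_end > f_start:  # any overlap counts
            expected[i, classes.index(value)] = 1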
Code Example #13
File: test_default.py  Project: ynop/audiomate
    def test_save_file_tracks(self, writer, sample_corpus, tmpdir):
        # use a subdirectory so that the relative paths differ from self.ds.path
        out_path = os.path.join(tmpdir.strpath, 'somesubdir')
        os.makedirs(out_path)

        writer.save(sample_corpus, out_path)

        file_1_path = os.path.relpath(resources.sample_wav_file('wav_1.wav'),
                                      out_path)
        file_2_path = os.path.relpath(resources.sample_wav_file('wav_2.wav'),
                                      out_path)
        file_3_path = os.path.relpath(resources.sample_wav_file('wav_3.wav'),
                                      out_path)
        file_4_path = os.path.relpath(resources.sample_wav_file('wav_4.wav'),
                                      out_path)

        with open(os.path.join(out_path, 'files.txt'), 'r') as f:
            file_content = f.read()

        expected = 'wav-1 {}\nwav_2 {}\nwav_3 {}\nwav_4 {}'.format(
            file_1_path, file_2_path, file_3_path, file_4_path)
        assert file_content.strip() == expected
Code Example #14
    def test_convert_files(self, tmp_path):
        source_path = resources.sample_wav_file('wav_1.wav')
        target_path = tmp_path / 'out.wav'

        files = [(source_path, 0, float('inf'), str(target_path))]  # (source, start, end, target)

        c = conversion.WavAudioFileConverter()
        c._convert_files(files)

        samples, sr = librosa.core.load(source_path, sr=None)

        stored_samples, stored_sr = librosa.core.load(str(target_path), sr=None)

        assert target_path.is_file()
        assert stored_sr == sr
        assert np.array_equal(stored_samples, samples)
Code Example #15
    def test_encode_utterance_takes_lower_index_first(self):
        file = assets.File('file-idx', resources.sample_wav_file('wav_1.wav'))
        utt = assets.Utterance('utt-idx', file, start=0, end=5)
        ll = assets.LabelList(
            labels=[assets.Label('music', 0, 3),
                    assets.Label('speech', 3, 5)])
        utt.set_label_list(ll)

        enc = label_encoding.FrameOrdinalEncoder(
            ['speech', 'music', 'noise'],
            frame_settings=units.FrameSettings(32000, 16000),
            sr=16000)

        actual = enc.encode(utt)
        expected = np.array([1, 1, 0, 0]).astype(int)

        assert np.array_equal(expected, actual)
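
What "takes lower index first" means here: frame i spans seconds [i, i + 2), so frame 2 covers one second of 'music' and one second of 'speech'; on that tie the class with the lower position in the list wins. A sketch of the tie-breaking logic (not audiomate's implementation):

classes = ['speech', 'music', 'noise']
labels = [('music', 0.0, 3.0), ('speech', 3.0, 5.0)]

def frame_class(i, frame_len=2.0, hop=1.0):
    # Sum each class's overlap with frame [i*hop, i*hop + frame_len);
    # min() over the winners keeps the lower class index on a tie.
    f_start, f_end = i * hop, i * hop + frame_len
    overlap = {}
    for value, l_start, l_end in labels:
        o = max(0.0, min(f_end, l_end) - max(f_start, l_start))
        idx = classes.index(value)
        overlap[idx] = overlap.get(idx, 0.0) + o
    best = max(overlap.values())
    return min(idx for idx, o in overlap.items() if o == best)

assert [frame_class(i) for i in range(4)] == [1, 1, 0, 0]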
Code Example #16
File: test_utterance.py  Project: ynop/audiomate
    def setup_method(self):
        self.ll_1 = annotations.LabelList(idx='alpha',
                                          labels=[
                                              annotations.Label('a', 3.2, 4.5),
                                              annotations.Label('b', 5.1, 8.9),
                                              annotations.Label(
                                                  'c', 7.2, 10.5),
                                              annotations.Label('d', 10.5, 14),
                                              annotations.Label('d', 15, 18)
                                          ])

        self.ll_2 = annotations.LabelList(idx='bravo',
                                          labels=[
                                              annotations.Label('a', 1.0, 4.2),
                                              annotations.Label('e', 4.2, 7.9),
                                              annotations.Label(
                                                  'c', 7.2, 10.5),
                                              annotations.Label('f', 10.5, 14),
                                              annotations.Label('d', 15, 17.3)
                                          ])

        self.ll_duplicate_idx = annotations.LabelList(
            idx='charlie',
            labels=[
                annotations.Label('t', 1.0, 4.2),
                annotations.Label('h', 4.2, 7.9)
            ])

        self.ll_3 = annotations.LabelList(idx='charlie',
                                          labels=[
                                              annotations.Label('a', 1.0, 4.2),
                                              annotations.Label('g', 4.2, 7.9)
                                          ])

        self.track = tracks.FileTrack('wav',
                                      resources.sample_wav_file('wav_1.wav'))
        self.issuer = issuers.Issuer('toni')
        self.utt = tracks.Utterance('test',
                                    self.track,
                                    issuer=self.issuer,
                                    start=1.25,
                                    end=1.30,
                                    label_lists=[
                                        self.ll_1, self.ll_2,
                                        self.ll_duplicate_idx, self.ll_3
                                    ])
Code Example #17
    def test_store_samples_sr_24(self, tmp_path):
        source_path = resources.sample_wav_file('wav_1.wav')
        target_path = tmp_path / 'out.wav'

        files = [(source_path, 0, float('inf'), str(target_path))]

        c = conversion.WavAudioFileConverter(sampling_rate=24000)
        c._convert_files(files)

        samples, sr = librosa.core.load(source_path, sr=24000)

        stored_samples, stored_sr = librosa.core.load(str(target_path), sr=None)

        assert target_path.is_file()
        assert stored_sr == sr
        # Don't compare too strictly:
        # with sox 14.4.1 the result isn't that precise, especially the first sample
        assert np.allclose(stored_samples[1:], samples[1:], atol=0.001)
Code Example #18
    def test_compute(self):
        test_file_path = resources.sample_wav_file('wav_1.wav')
        y, sr = librosa.load(test_file_path, sr=None)
        frames = librosa.util.frame(y, frame_length=2048, hop_length=1024).T

        # EXPECTED
        S = np.abs(librosa.stft(y, center=False, n_fft=2048,
                                hop_length=1024))**2
        S = librosa.feature.melspectrogram(S=S, n_mels=128, sr=sr)
        S = librosa.power_to_db(S)
        exp_onsets = librosa.onset.onset_strength(S=S, center=False).T
        exp_onsets = exp_onsets.reshape(exp_onsets.shape[0], 1)

        # ACTUAL
        onset = pipeline.OnsetStrength()
        onsets = onset.process_frames(frames, sr, last=True)

        assert np.allclose(onsets, exp_onsets)
Code Example #19
    def test_encode_utterance_takes_larger_label(self):
        file = tracks.FileTrack('file-idx',
                                resources.sample_wav_file('wav_1.wav'))
        utt = tracks.Utterance('utt-idx', file, start=0, end=8)
        ll = annotations.LabelList(labels=[
            annotations.Label('music', 0, 4.5),
            annotations.Label('speech', 4.5, 8)
        ])
        utt.set_label_list(ll)

        enc = encoding.FrameOrdinalEncoder(['music', 'speech', 'noise'],
                                           'default',
                                           frame_settings=units.FrameSettings(
                                               32000, 16000),
                                           sr=16000)

        actual = enc.encode_utterance(utt)
        expected = np.array([0, 0, 0, 0, 1, 1, 1]).astype(int)

        assert np.array_equal(expected, actual)
Code Example #20
File: test_base.py  Project: xjc90s/audiomate
def sample_utterance():
    file_track = tracks.FileTrack('test_file', resources.sample_wav_file('wav_1.wav'))
    utterance = tracks.Utterance('test', file_track)
    return utterance
Code Example #21
File: test_base.py  Project: toddrme2178/audiomate
def sample_utterance():
    file = assets.File('test_file', resources.sample_wav_file('wav_1.wav'))
    utterance = assets.Utterance('test', file)
    return utterance
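
Examples #20 and #21 build the same fixture against two generations of audiomate. Judging from the pairs in this collection (e.g. #9 vs. #10), the old assets namespace maps onto the newer modules roughly as follows:

# old API                new API
# assets.File        ->  tracks.FileTrack
# assets.Utterance   ->  tracks.Utterance
# assets.Issuer      ->  issuers.Issuer
# assets.Label       ->  annotations.Label
# assets.LabelList   ->  annotations.LabelList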