def test_convert_to_sequences():
    feature_extractor = FeatureExtractor(pad_mode="constant")
    audio_rep = np.zeros((feature_extractor.sequence_frames * 2, 2))  # (64, 2)
    frames = feature_extractor.convert_to_sequences(audio_rep)  # (3, 32, 2)
    assert frames.shape == (3, 32, 2)

    # Check that ignore last samples
    audio_rep = np.zeros(
        (feature_extractor.sequence_frames * 2 + 10, 2))  # (74, 2)
    frames = feature_extractor.convert_to_sequences(audio_rep)  # (3, 32, 2)
    assert frames.shape == (3, 32, 2)

    feature_extractor = FeatureExtractor(sequence_hop_time=-1,
                                         pad_mode="constant")
    frames = feature_extractor.convert_to_sequences(audio_rep)  # (1, 74, 2)
    assert frames.shape == (1, 32, 2)

    feature_extractor = FeatureExtractor(sequence_hop_time=-1,
                                         sequence_time=-1,
                                         pad_mode="constant")
    frames = feature_extractor.convert_to_sequences(audio_rep)  # (1, 74, 2)
    assert frames.shape == (1, 74, 2)

    # Test it together with pad_audio
    feature_extractor = FeatureExtractor(pad_mode="constant")
    n_frames = feature_extractor.sequence_frames
    n_samples = n_frames * feature_extractor.audio_hop + feature_extractor.audio_win
    audio = np.zeros(n_samples)
    audio_pad = feature_extractor.pad_audio(audio)
    stft = librosa.core.stft(
        audio_pad,
        n_fft=1024,
        hop_length=feature_extractor.audio_hop,
        win_length=feature_extractor.audio_win,
        center=False,
    )
    spectrogram = np.abs(stft)**2
    spectrogram = spectrogram.T
    frames = feature_extractor.convert_to_sequences(spectrogram)
    assert frames.shape == (1, feature_extractor.sequence_frames, 513)

    n_frames = 2 * feature_extractor.sequence_frames
    n_samples = n_frames * feature_extractor.audio_hop + feature_extractor.audio_win
    audio = np.zeros(n_samples)
    audio_pad = feature_extractor.pad_audio(audio)
    stft = librosa.core.stft(
        audio_pad,
        n_fft=1024,
        hop_length=feature_extractor.audio_hop,
        win_length=feature_extractor.audio_win,
        center=False,
    )
    spectrogram = np.abs(stft)**2
    spectrogram = spectrogram.T
    frames = feature_extractor.convert_to_sequences(spectrogram)
    assert frames.shape == (3, feature_extractor.sequence_frames, 513)
Exemple #2
0
def test_pad_audio():
    # No sequence slicing
    # sequence_frames = 32
    feature_extractor = FeatureExtractor(sequence_hop_time=-1,
                                         pad_mode='constant')
    n_frames = feature_extractor.sequence_frames - 1
    n_samples = n_frames * feature_extractor.audio_hop + feature_extractor.audio_win
    audio = np.ones(n_samples)
    audio_pad = feature_extractor.pad_audio(audio)
    assert len(
        audio_pad
    ) == feature_extractor.sequence_frames * feature_extractor.audio_hop + feature_extractor.audio_win

    # No sequence slicing, audio larger than sequence_time
    n_frames = feature_extractor.sequence_frames + 1
    n_samples = n_frames * feature_extractor.audio_hop + feature_extractor.audio_win
    audio = np.ones(n_samples)
    audio_pad = feature_extractor.pad_audio(audio)
    assert len(
        audio_pad
    ) == feature_extractor.sequence_frames * feature_extractor.audio_hop + feature_extractor.audio_win

    # Sequence slicing, audio shorter than one sequence
    # sequence_frames = 32
    # sequence_hop = 16
    feature_extractor = FeatureExtractor(pad_mode='constant')
    n_frames = feature_extractor.sequence_frames - 1
    n_samples = n_frames * feature_extractor.audio_hop + feature_extractor.audio_win
    audio = np.ones(n_samples)
    audio_pad = feature_extractor.pad_audio(audio)
    assert len(
        audio_pad
    ) == feature_extractor.sequence_frames * feature_extractor.audio_hop + feature_extractor.audio_win

    # Sequence slicing, audio larger than one sequence
    # sequence_frames = 32
    # sequence_hop = 16
    feature_extractor = FeatureExtractor(pad_mode='constant')
    n_frames = feature_extractor.sequence_frames + 1
    n_samples = n_frames * feature_extractor.audio_hop + feature_extractor.audio_win
    audio = np.ones(n_samples)
    audio_pad = feature_extractor.pad_audio(audio)
    assert len(audio_pad) == (
        feature_extractor.sequence_frames + feature_extractor.sequence_hop
    ) * feature_extractor.audio_hop + feature_extractor.audio_win

    # Sequence slicing, audio length equal to two sequences
    feature_extractor = FeatureExtractor(pad_mode='constant')
    n_frames = 2 * feature_extractor.sequence_frames
    n_samples = n_frames * feature_extractor.audio_hop + feature_extractor.audio_win
    audio = np.ones(n_samples)
    audio_pad = feature_extractor.pad_audio(audio)
    assert len(audio_pad) == (
        2 * feature_extractor.sequence_frames
    ) * feature_extractor.audio_hop + feature_extractor.audio_win