Example #1
# Assumed imports, as in the openl3 test suite:
import pytest
import openl3
from openl3.models import load_audio_embedding_model
from openl3.openl3_exceptions import OpenL3Error


def test_validate_audio_frontend():
    input_repr = 'mel128'

    # test kapre: the model takes raw audio (batch, channels, samples)
    mk = load_audio_embedding_model(input_repr, 'env', 512, frontend='kapre')
    assert len(mk.input_shape) == 3
    # assert openl3.models._validate_audio_frontend('infer', input_repr, mk) == ('kapre', input_repr)
    assert openl3.models._validate_audio_frontend('kapre', input_repr,
                                                  mk) == ('kapre', input_repr)

    # test librosa: the model takes precomputed spectrograms
    # (batch, freq, time, channels)
    ml = load_audio_embedding_model(input_repr, 'env', 512, frontend='librosa')
    assert len(ml.input_shape) == 4
    # assert openl3.models._validate_audio_frontend('infer', input_repr, ml) == ('librosa', input_repr)
    assert openl3.models._validate_audio_frontend('librosa', input_repr,
                                                  ml) == ('librosa',
                                                          input_repr)

    # test frontend + no input_repr: the kapre model falls back to the
    # default representation ('mel256'), while the librosa model requires
    # an explicit input_repr and raises
    assert openl3.models._validate_audio_frontend('kapre', None,
                                                  mk) == ('kapre', 'mel256')
    with pytest.raises(OpenL3Error):
        openl3.models._validate_audio_frontend('librosa', None, ml)

    # test mismatched frontend/model
    with pytest.raises(OpenL3Error):
        openl3.models._validate_audio_frontend('librosa', None, mk)
    with pytest.raises(OpenL3Error):
        openl3.models._validate_audio_frontend('kapre', None, ml)
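
For context, a rough sketch of how the two frontends are used for inference. This assumes openl3 >= 0.4, where openl3.get_audio_embedding accepts a preloaded model and a frontend argument:

import numpy as np
import openl3
from openl3.models import load_audio_embedding_model

# One second of silence at OpenL3's 48 kHz target sample rate.
audio, sr = np.zeros(48000), 48000

# Kapre frontend: the spectrogram is computed inside the Keras model.
mk = load_audio_embedding_model('mel128', 'env', 512, frontend='kapre')
emb, ts = openl3.get_audio_embedding(audio, sr, model=mk, frontend='kapre')

# Librosa frontend: the spectrogram is computed on the CPU beforehand, so
# input_repr must be given explicitly (as the validation test above shows).
ml = load_audio_embedding_model('mel128', 'env', 512, frontend='librosa')
emb, ts = openl3.get_audio_embedding(audio, sr, model=ml, frontend='librosa',
                                     input_repr='mel128')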
Example #2
# Presumably parametrized over the supported representations in the
# original test module; AUDIO_INPUT_REPR_SIZES and _compare_layers are
# helpers defined alongside these tests.
@pytest.mark.parametrize('input_repr', ['linear', 'mel128', 'mel256'])
def test_frontend(input_repr):
    # check spectrogram input size
    m = load_audio_embedding_model(input_repr, 'env', 512, frontend='librosa')
    assert m.input_shape == AUDIO_INPUT_REPR_SIZES[input_repr]
    m2 = load_audio_embedding_model(input_repr, 'env', 512, frontend='kapre')
    assert m2.input_shape == (None, 1, openl3.core.TARGET_SR)
    # compare all shared layers; the kapre model prepends a spectrogram
    # layer on raw audio, hence the one-layer offset
    _compare_layers(m.layers[1:], m2.layers[2:], compare_shapes=True)

    with pytest.raises(OpenL3Error):
        load_audio_embedding_model(input_repr,
                                   'env',
                                   512,
                                   frontend='not-a-thing')
Example #3
# Presumably registered as a pytest fixture in the original test module,
# since Example #4 receives it as a test argument:
@pytest.fixture
def ref_audio_model():
    input_repr, content_type, embedding_size = 'linear', 'music', 6144
    m = load_audio_embedding_model(input_repr, content_type, embedding_size)
    # assert isinstance(m.layers[1], kapre.time_frequency.Spectrogram)
    assert m.layers[1].output_shape == AUDIO_INPUT_REPR_SIZES[input_repr]
    assert m.output_shape[1] == embedding_size
    return m
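
ref_audio_model serves as a reference fixture for Example #4 below, which compares each freshly loaded model against it.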
Example #4
# Presumably parametrized over all supported combinations in the original
# test module; _compare_models is a helper defined alongside these tests.
@pytest.mark.parametrize('input_repr', ['linear', 'mel128', 'mel256'])
@pytest.mark.parametrize('content_type', ['music', 'env'])
@pytest.mark.parametrize('embedding_size', [6144, 512])
def test_get_audio_embedding_model(input_repr, content_type, embedding_size,
                                   ref_audio_model):
    m = load_audio_embedding_model(input_repr, content_type, embedding_size)
    _compare_models(m,
                    ref_audio_model,
                    AUDIO_INPUT_REPR_SIZES[input_repr],
                    embedding_size,
                    skip_layers=[1])
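
Here skip_layers=[1] presumably excludes the spectrogram layer from the comparison, since its type and shape depend on input_repr (see Example #6) while the reference model is fixed to the 'linear' representation.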
Example #5
# Assumed imports, as in openl3's CLI module:
import sys
from collections.abc import Iterable
from openl3 import (process_audio_file, process_image_file,
                    process_video_file)
from openl3.models import (load_audio_embedding_model,
                           load_image_embedding_model)
from openl3.openl3_exceptions import OpenL3Error
# get_file_list is a helper defined alongside this function that expands
# file and directory paths into a flat list of files.


def run(modality,
        inputs,
        output_dir=None,
        suffix=None,
        input_repr="mel256",
        content_type="music",
        audio_embedding_size=6144,
        audio_center=True,
        audio_hop_size=0.1,
        audio_batch_size=32,
        image_embedding_size=8192,
        image_batch_size=32,
        overwrite=False,
        verbose=False):
    """
    Computes and saves L3 embedding for given inputs.

    Parameters
    ----------
    modality : str
        Modality to be processed: 'audio', 'image', or 'video'
    inputs : list of str, or str
        File/directory path or list of file/directory paths to be processed
    output_dir : str or None
        Path to directory for saving output files. If None, output files will
        be saved to the directory containing the input file.
    suffix : str or None
        String to be appended to the output filename, i.e. <base filename>_<suffix>.npy.
        If None, then no suffix will be added, i.e. <base filename>.npy.
    input_repr : "linear", "mel128", or "mel256"
        Spectrogram representation used for model.
    content_type : "music" or "env"
        Type of content used to train embedding.
    audio_embedding_size : 6144 or 512
        Audio embedding dimensionality.
    audio_center : bool
        If True, pads beginning of signal so timestamps correspond
        to center of window.
    audio_hop_size : float
        Hop size in seconds.
    audio_batch_size : int
        Batch size used for input to audio embedding model.
    image_embedding_size : 8192 or 512
        Image embedding dimensionality.
    image_batch_size : int
        Batch size used for input to image embedding model.
    overwrite : bool
        If True, overwrites existing output files.
    verbose : bool
        If True, prints verbose messages.

    Returns
    -------
    None
        Embeddings are written to disk as .npy files.
    """

    if isinstance(inputs, str):
        file_list = [inputs]
    elif isinstance(inputs, Iterable):
        file_list = get_file_list(inputs)
    else:
        raise OpenL3Error('Invalid input: {}'.format(str(inputs)))

    if len(file_list) == 0:
        print('openl3: No files found in {}. Aborting.'.format(str(inputs)))
        sys.exit(-1)

    # Load model
    if modality == 'audio':
        model = load_audio_embedding_model(input_repr, content_type,
                                           audio_embedding_size)

        # Process all files in the arguments
        process_audio_file(file_list,
                           output_dir=output_dir,
                           suffix=suffix,
                           model=model,
                           center=audio_center,
                           hop_size=audio_hop_size,
                           batch_size=audio_batch_size,
                           overwrite=overwrite,
                           verbose=verbose)
    elif modality == 'image':
        model = load_image_embedding_model(input_repr, content_type,
                                           image_embedding_size)

        # Process all files in the arguments
        process_image_file(file_list,
                           output_dir=output_dir,
                           suffix=suffix,
                           model=model,
                           batch_size=image_batch_size,
                           overwrite=overwrite,
                           verbose=verbose)
    elif modality == 'video':
        audio_model = load_audio_embedding_model(input_repr, content_type,
                                                 audio_embedding_size)
        image_model = load_image_embedding_model(input_repr, content_type,
                                                 image_embedding_size)

        # Process all files in the arguments
        process_video_file(file_list,
                           output_dir=output_dir,
                           suffix=suffix,
                           audio_model=audio_model,
                           image_model=image_model,
                           audio_embedding_size=audio_embedding_size,
                           audio_center=audio_center,
                           audio_hop_size=audio_hop_size,
                           audio_batch_size=audio_batch_size,
                           image_batch_size=image_batch_size,
                           image_embedding_size=image_embedding_size,
                           overwrite=overwrite,
                           verbose=verbose)
    else:
        raise OpenL3Error('Invalid modality: {}'.format(modality))

    if verbose:
        print('openl3: Done!')
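
A minimal invocation sketch; the input and output paths below are hypothetical:

run('audio', '/path/to/clip.wav',      # hypothetical input file
    output_dir='/tmp/embeddings',      # hypothetical output directory
    input_repr='mel128',
    content_type='env',
    audio_embedding_size=512,
    verbose=True)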
Example #6
def test_load_audio_embedding_model():
    # (imports as in Example #1)
    import itertools
    import kapre

    # Exercise every supported (input_repr, content_type, embedding_size)
    # combination.
    first_model = None
    for input_repr, content_type, embedding_size in itertools.product(
            ('linear', 'mel128', 'mel256'), ('music', 'env'), (6144, 512)):
        m = load_audio_embedding_model(input_repr, content_type,
                                       embedding_size)

        # Layer 1 is the spectrogram frontend; mel models expose their
        # filterbank size in the last weight's second dimension.
        if input_repr == 'linear':
            assert isinstance(m.layers[1], kapre.time_frequency.Spectrogram)
        else:
            assert isinstance(m.layers[1],
                              kapre.time_frequency.Melspectrogram)
            assert int(m.layers[1].weights[-1].shape[1]) == int(input_repr[3:])

        assert m.output_shape[1] == embedding_size

        if first_model is None:
            first_model = m
            continue

        # Check model consistency against the first loaded model
        assert isinstance(m.layers[0], type(first_model.layers[0]))
        assert len(m.layers) == len(first_model.layers)
        assert all(
            isinstance(l1, type(l2))
            for (l1, l2) in zip(m.layers[2:], first_model.layers[2:]))
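
The exhaustive sweep above could equally be driven by pytest.mark.parametrize, as sketched in Example #4, which would report each combination as a separate test case rather than stopping at the first failing one.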