Example 1
    def __init__(self, rutaCarac, dic_inf_audios=None):
        '''
        Parameters
        ----------
        rutaCarac: str
            path where the extracted-feature files are stored
        dic_inf_audios: dict
            dictionary of extra attributes for each audio.
            Key: audio name. Value: dictionary mapping each attribute name to its value.
        '''

        self.rutaCcas = '../' + rutaCarac  # e.g.: CaracteristicasExtraidas/Vggish/embeddings||espectros/

        try:
            os.mkdir(self.rutaCcas)
        except FileExistsError:
            print(
                'Feature directory already exists; not creating a new one.'
            )

        self.dic_inf_audios = dic_inf_audios

        # Define VGGish
        self.model = vggish_keras.get_vggish_keras()
        # Load the pretrained checkpoint
        checkpoint_path = 'vggish_weights.ckpt'
        self.model.load_weights(checkpoint_path)
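
For orientation, here is a minimal sketch of how this constructor might be called. The enclosing class is not shown in the snippet, so the name VggishExtractor is purely hypothetical:

# Hypothetical class name; the snippet above only shows its __init__.
extractor = VggishExtractor(
    'CaracteristicasExtraidas/Vggish/embeddings/',  # stored under ../ by __init__
    dic_inf_audios={'audio1.wav': {'speaker': 'A', 'duration_s': 3.2}})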
Example 2
def nonpretrained_vggish_volumetric(pretrained=True):
    # Note: the pretrained flag is accepted for API symmetry but unused here.
    # Assumes the usual Keras imports (Dense, Reshape, Conv3DTranspose from
    # keras.layers; Model from keras.models).
    base_model = get_vggish_keras()

    # Final output of the base model: the 128-D VGGish embedding.
    layer4 = base_model.get_output_at(-1)

    # Project the embedding up and reshape it into a coarse 3-D feature volume.
    x = Dense(6 * 7 * 6 * 1024, activation='elu')(layer4)
    y = Reshape((6, 7, 6, 1024))(x)

    # Four stride-2 transposed convolutions upsample (6, 7, 6) step by step.
    y = Conv3DTranspose(512, (3, 3, 3), (2, 2, 2), activation='elu')(y)
    y = Conv3DTranspose(256, (3, 3, 3), (2, 2, 2), activation='elu')(y)
    y = Conv3DTranspose(128, (3, 3, 3), (2, 2, 2), activation='elu')(y)
    y = Conv3DTranspose(1, (3, 3, 3), (2, 2, 2), activation='elu')(y)

    model = Model(inputs=base_model.input, outputs=y)
    return model
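
A quick way to see what the decoder produces is a shape check. This sketch assumes get_vggish_keras builds the standard VGGish with a (96, 64, 1) input, as in Example 5; with Keras's default 'valid' padding, each stride-2, kernel-3 Conv3DTranspose maps a dimension d to 2*(d - 1) + 3, so (6, 7, 6) grows to (111, 127, 111) over the four layers:

# Shape check under the assumptions above (not part of the original code).
import numpy as np

model = nonpretrained_vggish_volumetric()
patch = np.random.rand(1, 96, 64, 1).astype('float32')  # one log-mel patch
volume = model.predict(patch)
print(volume.shape)  # expected: (1, 111, 127, 111, 1)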
Example 3
def nonpretrained_vggish_volumetric_context(
        img_shape=(20, 96, 64, 1), pretrained=True):

    base_model = get_vggish_keras()

    layer4 = base_model.get_output_at(-1)

    # Per-frame embedding model, applied to every patch in the input sequence.
    singleframe_model = Model(inputs=base_model.input, outputs=layer4)

    sequence = Input(shape=img_shape, dtype='float32')
    x = TimeDistributed(singleframe_model)(sequence)
    # Collapse the sequence of per-frame embeddings into one 512-D context vector.
    x = LSTM(512)(x)

    x = Dense(6 * 7 * 6 * 1024, activation='elu')(x)
    y = Reshape((6, 7, 6, 1024))(x)

    y = Conv3DTranspose(512, (3, 3, 3), (2, 2, 2), activation='elu')(y)
    y = Conv3DTranspose(256, (3, 3, 3), (2, 2, 2), activation='elu')(y)
    y = Conv3DTranspose(128, (3, 3, 3), (2, 2, 2), activation='elu')(y)
    out = Conv3DTranspose(1, (3, 3, 3), (2, 2, 2), activation='elu')(y)
    model = Model(inputs=sequence, outputs=out)

    return model
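
The context variant takes a sequence of 20 such patches per sample and emits the same output volume; a shape check under the same assumptions as the sketch after Example 2:

import numpy as np

model = nonpretrained_vggish_volumetric_context()
seqs = np.random.rand(2, 20, 96, 64, 1).astype('float32')  # 2 sequences of 20 patches
print(model.predict(seqs).shape)  # expected: (2, 111, 127, 111, 1)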
Example 4
# NOTE: this snippet begins mid-script. In the omitted preamble, sess is an
# open tf.Session in which the TF-Slim VGGish graph has been defined and its
# checkpoint restored, and checkpoint_file is the output path for the Keras
# weights. The dangling fragment below is most plausibly the tail of the
# standard embedding-tensor lookup (variable name assumed):
embedding_tensor = sess.graph.get_tensor_by_name(
    vggish_params.OUTPUT_TENSOR_NAME)

pproc = vggish_postprocess.Postprocessor('vggish_pca_params.npz')

weights = {}
operations = sess.graph.get_operations()

# First pass: collect kernels. TF variable-read ops are named like
# 'vggish/conv1/weights/read'; stripping the scope prefixes (including the
# nested 'conv3/' / 'conv4/' scopes and the 'fc1' parent scope) leaves
# '<keras_layer_name>/<weights|biases>'.
for op in operations:
    name = op.name
    if 'read' in name:
        name2 = name.replace('vggish/', '').replace('/read', '').replace(
            'conv3/', '').replace('conv4/', '').replace('/fc1', '')
        name2_layer, name2_type = name2.split('/')
        if name2_type == 'weights':
            weights[name2_layer] = [sess.run(op.values())[0]]

# Second pass: append each layer's bias after its kernel, yielding the
# [kernel, bias] order that Keras's set_weights expects.
for op in operations:
    name = op.name
    if 'read' in name:
        name2 = name.replace('vggish/', '').replace('/read', '').replace(
            'conv3/', '').replace('conv4/', '').replace('/fc1', '')
        name2_layer, name2_type = name2.split('/')
        if name2_type == 'biases':
            weights[name2_layer].append(sess.run(op.values())[0])

# Copy the [kernel, bias] pairs into the matching Keras layers and save.
model = get_vggish_keras()
for layer in model.layers:
    if layer.name in weights:
        layer.set_weights(weights[layer.name])
model.save_weights(checkpoint_file)
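
Once the checkpoint is written, a hedged sanity check (not in the original script) is to reload the converted weights into a fresh Keras model and compare them against the values captured from TensorFlow. This assumes every converted layer stores exactly [kernel, bias], as built above:

import numpy as np

model2 = get_vggish_keras()
model2.load_weights(checkpoint_file)
for layer in model2.layers:
    if layer.name in weights:
        kernel, bias = layer.get_weights()
        assert np.allclose(kernel, weights[layer.name][0])
        assert np.allclose(bias, weights[layer.name][1])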
Example 5
# Assumed preamble for this excerpt (the tolerance matches the original VGGish
# smoke test; the two paths follow Examples 1 and 4):
import numpy as np
import vggish_input
import vggish_keras
import vggish_params
import vggish_postprocess
checkpoint_path = 'vggish_weights.ckpt'
pca_params_path = 'vggish_pca_params.npz'
rel_error = 0.1

# Synthesize a 3-second 1 kHz sine wave sampled at 44.1 kHz.
num_secs = 3
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
model = vggish_keras.get_vggish_keras()
model.load_weights(checkpoint_path)
embedding_batch = model.predict(input_batch[:, :, :, None])
print('VGGish embedding: ', embedding_batch[0])
expected_embedding_mean = 0.131
expected_embedding_std = 0.238
np.testing.assert_allclose(
    [np.mean(embedding_batch), np.std(embedding_batch)],
    [expected_embedding_mean, expected_embedding_std],
    rtol=rel_error)

# Postprocess the results to produce whitened quantized embeddings.
pproc = vggish_postprocess.Postprocessor(pca_params_path)
postprocessed_batch = pproc.postprocess(embedding_batch)
print('Postprocessed VGGish embedding: ', postprocessed_batch[0])
expected_postprocessed_mean = 123.0
expected_postprocessed_std = 75.0  # completing the truncated check, mirroring the one above
np.testing.assert_allclose(
    [np.mean(postprocessed_batch), np.std(postprocessed_batch)],
    [expected_postprocessed_mean, expected_postprocessed_std],
    rtol=rel_error)
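
One property worth knowing, and easy to check, is that the postprocessor returns 8-bit quantized embeddings. A quick probe using the postprocessed_batch from above:

# The postprocessor applies a PCA/whitening transform, clips, and quantizes
# the result to unsigned 8-bit values in [0, 255].
print(postprocessed_batch.dtype)  # uint8
print(postprocessed_batch.min(), postprocessed_batch.max())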
Example 6
def nonpretrained_vggish_volumetric_context_FPN(
        img_shape=(20, 96, 64, 1), pretrained=True):
    def get_crop_shape(target, refer):
        # width, the 3rd dimension
        cw = (target.get_shape()[2] - refer.get_shape()[2]).value
        assert (cw >= 0)
        if cw % 2 != 0:
            cw1, cw2 = int(cw / 2), int(cw / 2) + 1
        else:
            cw1, cw2 = int(cw / 2), int(cw / 2)
        # height, the 2nd dimension
        ch = (target.get_shape()[1] - refer.get_shape()[1]).value
        assert (ch >= 0)
        if ch % 2 != 0:
            ch1, ch2 = int(ch / 2), int(ch / 2) + 1
        else:
            ch1, ch2 = int(ch / 2), int(ch / 2)

        return (ch1, ch2), (cw1, cw2)

    def _upsample_add(x, y, crop=0):
        # Upsample the coarser map x by 2x and add the lateral map y, cropping
        # first when the upsampled size overshoots y's size.
        out = UpSampling2D(size=(2, 2), interpolation='bilinear')(x)
        if crop == 1:
            ch, cw = get_crop_shape(out, y)
            out = Cropping2D(cropping=(ch, cw))(out)

        return Add()([out, y])

    base_model = get_vggish_keras()

    # Feature taps from successive stages of VGGish; layer4 is the final
    # 128-D embedding output.
    layer1 = base_model.layers[3].output
    layer2 = base_model.layers[6].output
    layer3 = base_model.layers[9].output
    layer4 = base_model.get_output_at(-1)

    # 3x3 smoothing convolutions applied after each top-down merge.
    smooth1 = Conv2D(128, kernel_size=3, strides=1, padding="same")
    smooth2 = Conv2D(128, kernel_size=3, strides=1, padding="same")

    # 1x1 lateral convolutions projecting each tap to 128 channels.
    toplayer = Conv2D(128, kernel_size=1, strides=1)
    latlayer1 = Conv2D(128, kernel_size=1, strides=1)
    latlayer2 = Conv2D(128, kernel_size=1, strides=1)

    # Top-down pathway of the feature pyramid.
    p5 = toplayer(layer3)
    p4 = _upsample_add(p5, latlayer1(layer2), crop=1)
    p4 = smooth1(p4)
    p3 = _upsample_add(p4, latlayer2(layer1))
    p3 = smooth2(p3)

    # Concatenate the embedding with globally pooled pyramid features.
    z = concatenate([
        layer4,
        GlobalAveragePooling2D()(p3),
        GlobalAveragePooling2D()(p4),
        GlobalAveragePooling2D()(p5)
    ])
    singleframe_model = Model(inputs=base_model.input, outputs=z)

    sequence = Input(shape=img_shape, dtype='float32')
    x = TimeDistributed(singleframe_model)(sequence)
    x = LSTM(512)(x)

    x = Dense(6 * 7 * 6 * 1024, activation='elu')(x)
    y = Reshape((6, 7, 6, 1024))(x)

    y = Conv3DTranspose(512, (3, 3, 3), (2, 2, 2), activation='elu')(y)
    y = Conv3DTranspose(256, (3, 3, 3), (2, 2, 2), activation='elu')(y)
    y = Conv3DTranspose(128, (3, 3, 3), (2, 2, 2), activation='elu')(y)
    out = Conv3DTranspose(1, (3, 3, 3), (2, 2, 2), activation='elu')(y)
    model = Model(inputs=sequence, outputs=out)

    return model
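
The cropping arithmetic in get_crop_shape is easiest to see with concrete numbers: when the upsampled map overshoots the lateral map by an odd amount, the extra rows or columns are split unevenly between the two sides. A standalone rerun of that logic (plain Python, no TF required):

# target dim 25 vs. reference dim 24 -> total crop 1, split as (0, 1);
# an even difference such as 26 vs. 24 splits symmetrically as (1, 1).
for target, refer in [(25, 24), (26, 24)]:
    c = target - refer
    c1, c2 = (c // 2, c // 2 + 1) if c % 2 else (c // 2, c // 2)
    print((target, refer), '->', (c1, c2))

The input/output signature is otherwise identical to Example 3; the FPN branch only enriches the per-frame feature vector fed to the LSTM.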