def __init__(self, rutaCarac, dic_inf_audios=None):
    """Prepare the feature output directory and load the VGGish model.

    Parameters
    ----------
    rutaCarac : str
        Directory where the feature files are saved. It is resolved
        against the parent of the working directory ('../' is prepended),
        e.g. ``CaracteristicasExtraidas/Vggish/embeddings/`` or
        ``CaracteristicasExtraidas/Vggish/espectros/``.
    dic_inf_audios : dict, optional
        Extra attributes for each audio. Key: audio name. Value: dict
        mapping attribute name to attribute value.
    """
    self.rutaCcas = '../' + rutaCarac

    try:
        # makedirs rather than mkdir: the feature path is typically nested,
        # and mkdir raises FileNotFoundError when a parent is missing.
        os.makedirs(self.rutaCcas)
    except FileExistsError:
        print(
            'Directorio de características ya existente, no se crea nuevo.'
        )

    self.dic_inf_audios = dic_inf_audios

    # Build the VGGish Keras model.
    self.model = vggish_keras.get_vggish_keras()

    # Load the weights from the checkpoint (relative path, so it depends on
    # the current working directory).
    checkpoint_path = 'vggish_weights.ckpt'
    self.model.load_weights(checkpoint_path)
def nonpretrained_vggish_volumetric(pretrained=True):
    """Build a VGGish model followed by a 3-D transposed-convolution decoder.

    The tensor returned by ``get_output_at(-1)`` on the VGGish Keras model
    is projected by an ELU dense layer, reshaped to ``(6, 7, 6, 1024)`` and
    passed through four ``Conv3DTranspose`` layers (3x3x3 kernel, stride 2,
    ELU) with 512, 256, 128 and 1 filters.

    Parameters
    ----------
    pretrained : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    Model
        Keras model mapping the VGGish input to the decoded volume.
    """
    seed_shape = (6, 7, 6, 1024)
    decoder_filters = (512, 256, 128, 1)

    # Number of dense units needed to fill the seed volume.
    seed_units = 1
    for dim in seed_shape:
        seed_units *= dim

    encoder = get_vggish_keras()
    encoder_output = encoder.get_output_at(-1)

    volume = Dense(seed_units, activation='elu')(encoder_output)
    volume = Reshape(seed_shape)(volume)
    for n_filters in decoder_filters:
        volume = Conv3DTranspose(
            n_filters,
            kernel_size=(3, 3, 3),
            strides=(2, 2, 2),
            activation='elu',
        )(volume)

    return Model(inputs=encoder.input, outputs=volume)
def nonpretrained_vggish_volumetric_context(
        img_shape=(20, 96, 64, 1), pretrained=True):
    """Build a sequence model: per-frame VGGish, LSTM, then a 3-D decoder.

    The VGGish Keras model is wrapped as a single-frame model and applied
    to every step of the input sequence with ``TimeDistributed``. An
    ``LSTM(512)`` summarises the sequence; its output is projected by an
    ELU dense layer, reshaped to ``(6, 7, 6, 1024)`` and decoded by four
    ``Conv3DTranspose`` layers (3x3x3 kernel, stride 2, ELU) with 512, 256,
    128 and 1 filters.

    Parameters
    ----------
    img_shape : tuple
        Shape of one input sequence (without the batch axis), passed to
        ``Input``.
    pretrained : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    Model
        Keras model mapping the input sequence to the decoded volume.
    """
    seed_shape = (6, 7, 6, 1024)
    decoder_filters = (512, 256, 128, 1)

    # Number of dense units needed to fill the seed volume.
    seed_units = 1
    for dim in seed_shape:
        seed_units *= dim

    encoder = get_vggish_keras()
    encoder_output = encoder.get_output_at(-1)
    frame_encoder = Model(inputs=encoder.input, outputs=encoder_output)

    frames = Input(shape=img_shape, dtype='float32')
    features = TimeDistributed(frame_encoder)(frames)
    features = LSTM(512)(features)

    volume = Dense(seed_units, activation='elu')(features)
    volume = Reshape(seed_shape)(volume)
    for n_filters in decoder_filters:
        volume = Conv3DTranspose(
            n_filters,
            kernel_size=(3, 3, 3),
            strides=(2, 2, 2),
            activation='elu',
        )(volume)

    return Model(inputs=frames, outputs=volume)
vggish_params.OUTPUT_TENSOR_NAME) pproc = vggish_postprocess.Postprocessor('vggish_pca_params.npz') weights = {} operations = sess.graph.get_operations() for op in operations: name = op.name if 'read' in name: name2 = name.replace('vggish/', '').replace('/read', '').replace( 'conv3/', '').replace('conv4/', '').replace('/fc1', '') name2_layer, name2_type = name2.split('/') if name2_type == 'weights': weights[name2_layer] = [] weights[name2_layer].append(sess.run(op.values())[0]) for op in operations: name = op.name if 'read' in name: name2 = name.replace('vggish/', '').replace('/read', '').replace( 'conv3/', '').replace('conv4/', '').replace('/fc1', '') name2_layer, name2_type = name2.split('/') if name2_type == 'biases': weights[name2_layer].append(sess.run(op.values())[0]) model = get_vggish_keras() for layer in model.layers: if layer.name in list(weights.keys()): layer.set_weights(weights[layer.name]) model.save_weights(checkpoint_file)
# Smoke-test excerpt: runs a synthetic tone through the VGGish pipeline and
# compares summary statistics against reference values.
# NOTE(review): checkpoint_path, pca_params_path and rel_error are not
# defined in this excerpt; they must be set earlier in the script.

# Synthesize the input: a 1 kHz sine wave, 3 seconds long, at 44.1 kHz.
num_secs = 3
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
# The batch must hold num_secs examples of NUM_FRAMES x NUM_BANDS each.
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
model = vggish_keras.get_vggish_keras()
model.load_weights(checkpoint_path)
# Indexing with None appends a trailing length-1 axis before prediction.
embedding_batch = model.predict(input_batch[:,:,:,None])
print('VGGish embedding: ', embedding_batch[0])
# Reference mean/std of the raw embeddings, compared within rel_error.
expected_embedding_mean = 0.131
expected_embedding_std = 0.238
np.testing.assert_allclose(
    [np.mean(embedding_batch), np.std(embedding_batch)],
    [expected_embedding_mean, expected_embedding_std],
    rtol=rel_error)

# Postprocess the results to produce whitened quantized embeddings.
pproc = vggish_postprocess.Postprocessor(pca_params_path)
postprocessed_batch = pproc.postprocess(embedding_batch)
print('Postprocessed VGGish embedding: ', postprocessed_batch[0])
# Reference mean for the postprocessed batch; presumably checked by code
# that follows this excerpt.
expected_postprocessed_mean = 123.0
def nonpretrained_vggish_volumetric_context_FPN(
        img_shape=(20, 96, 64, 1), pretrained=True):
    """Build a sequence model with an FPN-style per-frame VGGish encoder.

    Per frame, the outputs of ``layers[3]``, ``layers[6]`` and ``layers[9]``
    of the VGGish Keras model are merged top-down (1x1 lateral convolutions,
    bilinear 2x upsampling, element-wise addition, 3x3 smoothing). The
    resulting maps are globally average-pooled and concatenated with the
    tensor returned by ``get_output_at(-1)``. That per-frame vector is
    applied over the input sequence with ``TimeDistributed``, summarised by
    ``LSTM(512)``, projected by an ELU dense layer, reshaped to
    ``(6, 7, 6, 1024)`` and decoded by four ``Conv3DTranspose`` layers
    (3x3x3 kernel, stride 2, ELU) with 512, 256, 128 and 1 filters.

    Parameters
    ----------
    img_shape : tuple
        Shape of one input sequence (without the batch axis), passed to
        ``Input``.
    pretrained : bool
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    Model
        Keras model mapping the input sequence to the decoded volume.
    """

    def get_crop_shape(target, refer):
        """Return ((top, bottom), (left, right)) crops shrinking target to refer.

        When the size difference is odd, the extra row/column is removed
        from the second (bottom/right) side.
        """
        # as_list() yields plain ints on both TF1 and TF2; the previous
        # `.value` access on a Dimension difference only exists in TF1.
        target_dims = target.get_shape().as_list()
        refer_dims = refer.get_shape().as_list()

        # width, the 3rd dimension
        cw = target_dims[2] - refer_dims[2]
        assert cw >= 0
        cw1 = cw // 2
        cw2 = cw - cw1

        # height, the 2nd dimension
        ch = target_dims[1] - refer_dims[1]
        assert ch >= 0
        ch1 = ch // 2
        ch2 = ch - ch1

        return (ch1, ch2), (cw1, cw2)

    def _upsample_add(x, y, crop=0):
        """Upsample x by 2 (bilinear), optionally crop it to y, and add y."""
        out = UpSampling2D(size=(2, 2), interpolation='bilinear')(x)
        if crop == 1:
            ch, cw = get_crop_shape(out, y)
            out = Cropping2D(cropping=(ch, cw))(out)
        return Add()([out, y])

    base_model = get_vggish_keras()
    layer1 = base_model.layers[3].output
    layer2 = base_model.layers[6].output
    layer3 = base_model.layers[9].output
    layer4 = base_model.get_output_at(-1)

    # Smoothing layers applied after each top-down merge.
    smooth1 = Conv2D(128, kernel_size=3, strides=1, padding="same")
    smooth2 = Conv2D(128, kernel_size=3, strides=1, padding="same")

    # Lateral layers (1x1 convolutions); the tensor each one is applied to
    # is noted on the right.
    toplayer = Conv2D(128, kernel_size=1, strides=1)  # applied to layer3
    latlayer1 = Conv2D(128, kernel_size=1, strides=1)  # applied to layer2
    latlayer2 = Conv2D(128, kernel_size=1, strides=1)  # applied to layer1

    # Top-down pathway. Only the first merge crops the upsampled map.
    p5 = toplayer(layer3)
    p4 = _upsample_add(p5, latlayer1(layer2), crop=1)
    p4 = smooth1(p4)
    p3 = _upsample_add(p4, latlayer2(layer1))
    p3 = smooth2(p3)

    # Per-frame descriptor: final VGGish output plus pooled pyramid maps.
    z = concatenate([
        layer4,
        GlobalAveragePooling2D()(p3),
        GlobalAveragePooling2D()(p4),
        GlobalAveragePooling2D()(p5)
    ])

    singleframe_model = Model(inputs=base_model.input, outputs=z)

    sequence = Input(shape=img_shape, dtype='float32')
    x = TimeDistributed(singleframe_model)(sequence)
    x = LSTM(512)(x)
    x = Dense(6 * 7 * 6 * 1024, activation='elu')(x)

    y = Reshape((6, 7, 6, 1024))(x)
    for n_filters in (512, 256, 128, 1):
        y = Conv3DTranspose(
            n_filters, (3, 3, 3), (2, 2, 2), activation='elu')(y)

    model = Model(inputs=sequence, outputs=y)
    return model