def build_img_G(self):
    """Build the fully-connected generator model.

    Two fixed-batch inputs -- an audio embedding of width
    ``self.audio_emb_dim`` and a noise vector of width ``self.noise_dim`` --
    are concatenated and pushed through three
    Dense -> LeakyReLU(0.2) -> BatchNormalization(momentum=0.8) stages with
    256, 512 and 1024 units. A sigmoid Dense layer with
    ``np.prod(self.img_shape)`` units is then reshaped to ``self.img_shape``
    (layer name ``'generated_img'``) and passed through ``Encoding_layer``.

    Returns:
        A ``Model`` taking ``[audio_embeddings, audio_noise]`` and producing
        the output of ``Encoding_layer`` applied to the generated image
        (the image itself is an intermediate tensor, not the model output).
    """
    emb_in = Input(batch_shape=(self.batch_size, self.audio_emb_dim))
    noise_in = Input(batch_shape=(self.batch_size, self.noise_dim))

    hidden = Concatenate()([emb_in, noise_in])

    # Progressively wider MLP; each stage is Dense -> LeakyReLU -> BatchNorm.
    for units in (256, 512, 1024):
        hidden = Dense(units)(hidden)
        hidden = LeakyReLU(alpha=0.2)(hidden)
        hidden = BatchNormalization(momentum=0.8)(hidden)

    # Sigmoid keeps every generated pixel value in [0, 1].
    flat_pixels = Dense(np.prod(self.img_shape), activation='sigmoid')(hidden)
    generated = Reshape(self.img_shape, name='generated_img')(flat_pixels)

    # NOTE(review): Encoding_layer is defined outside this block; presumably
    # it converts the image to an audio representation -- confirm there.
    encoded = Encoding_layer()(generated)

    return Model(inputs=[emb_in, noise_in], outputs=encoded)
def build_img_G_conv_full(self):
    """Build the convolutional generator model.

    The audio embedding (``self.audio_emb_dim`` wide) and the noise vector
    (``self.noise_dim`` wide) are concatenated, projected by a ReLU Dense
    layer to ``128 * 7 * 7`` values and reshaped to a ``(7, 7, 128)`` feature
    map. Two UpSampling2D -> Conv2D(5x5, same) ->
    BatchNormalization(momentum=0.8) -> ReLU stages follow, with 128 and 64
    filters. A final 3x3 ``Conv2D`` with ``self.channels`` filters and a
    sigmoid activation (layer name ``'generated_img'``) yields the image,
    which is then passed through ``Encoding_layer``.

    Returns:
        A ``Model`` taking ``[audio_embeddings, audio_noise]`` and producing
        the output of ``Encoding_layer`` applied to the generated image.
    """
    emb_in = Input(batch_shape=(self.batch_size, self.audio_emb_dim))
    noise_in = Input(batch_shape=(self.batch_size, self.noise_dim))
    latent = Concatenate()([emb_in, noise_in])

    # Project the latent vector onto a 7x7 seed feature map with 128 channels.
    feat = Dense(128 * 7 * 7, activation='relu')(latent)
    feat = Reshape((7, 7, 128))(feat)

    # Each stage upsamples with UpSampling2D's default size, so the 7x7 seed
    # grows to 14x14 and then 28x28.
    # NOTE(review): presumably img_rows/img_cols are 28 -- confirm.
    # A third stage (32 filters) exists in the history of this method but is
    # currently disabled.
    for n_filters in (128, 64):
        feat = UpSampling2D()(feat)
        feat = Conv2D(n_filters, kernel_size=5, padding="same")(feat)
        feat = BatchNormalization(momentum=0.8)(feat)
        feat = Activation("relu")(feat)

    pre_activation = Conv2D(self.channels, kernel_size=3, padding="same")(feat)
    generated = Activation(activation='sigmoid', name='generated_img')(pre_activation)

    # NOTE(review): Encoding_layer is defined outside this block; presumably
    # it converts the image to an audio representation -- confirm there.
    encoded = Encoding_layer()(generated)

    return Model(inputs=[emb_in, noise_in], outputs=encoded)
def build_audio_encoder(self):
    """Build a model that applies only ``Encoding_layer`` to an image batch.

    The input (layer name ``'img_input'``) has the fixed batch shape
    ``(batch_size, img_rows, img_cols, channels)``; the single layer is an
    ``Encoding_layer`` named ``'vOICe'``.

    Returns:
        A ``Model`` mapping the image input to the ``Encoding_layer`` output.
    """
    frame_shape = (self.batch_size, self.img_rows, self.img_cols, self.channels)
    image_in = Input(batch_shape=frame_shape, name='img_input')
    encoded = Encoding_layer(name='vOICe')(image_in)
    return Model(inputs=image_in, outputs=encoded)
def build_audio_C(self):
    """Build the audio classifier model.

    An image batch of fixed shape ``(batch_size, img_rows, img_cols,
    channels)`` goes through ``Encoding_layer`` (named ``'vOICe'``) and
    ``logMelSpectrogram`` (named ``'logSpectrogram'``). The result feeds four
    convolutional blocks of 3x3 ReLU convolutions (64, 128, 256x2, 512x2
    filters), each closed by a 2x2 stride-2 max-pool. The features are then
    flattened and passed through a 4096-unit ReLU layer (``'fc1'``), a
    ``self.audio_emb_dim``-unit ReLU layer (``'embeddings'``) and a softmax
    over ``self.classes`` (``'prediction'``).

    Returns:
        A ``Model`` mapping the image input to the class probabilities.
    """
    frame_shape = (self.batch_size, self.img_rows, self.img_cols, self.channels)
    image_in = Input(batch_shape=frame_shape, name='img_input')

    # NOTE(review): both layers are defined outside this block; presumably
    # image -> audio -> log-mel spectrogram -- confirm against their code.
    encoded = Encoding_layer(name='vOICe')(image_in)
    features = logMelSpectrogram(name='logSpectrogram')(encoded)

    # (filters, conv layer names, pooling layer name) for each block.
    conv_blocks = (
        (64, ('conv1',), 'pool1'),
        (128, ('conv2',), 'pool2'),
        (256, ('conv3/conv3_1', 'conv3/conv3_2'), 'pool3'),
        (512, ('conv4/conv4_1', 'conv4/conv4_2'), 'pool4'),
    )
    for n_filters, conv_names, pool_name in conv_blocks:
        for conv_name in conv_names:
            features = Conv2D(n_filters, (3, 3), strides=(1, 1),
                              activation='relu', padding='same',
                              name=conv_name)(features)
        features = MaxPooling2D((2, 2), strides=(2, 2), padding='same',
                                name=pool_name)(features)

    # Classification head.
    features = Flatten(name='flatten_')(features)
    features = Dense(4096, activation='relu', name='fc1')(features)
    emb = Dense(self.audio_emb_dim, activation='relu', name='embeddings')(features)
    class_probs = Dense(self.classes, activation='softmax', name='prediction')(emb)

    return Model(inputs=image_in, outputs=class_probs)