def build_models(self, image):
    """Assemble the unconditional image discriminator graph.

    Four ``op.conv2d`` stages are stacked inside the ``img_discriminator``
    variable scope; their second argument grows 64 -> 128 -> 256 -> 512
    (presumably the filter count -- confirm against ``op.conv2d``). The
    last activation is flattened per example and mapped to one value by
    ``op.linear``.

    Args:
        image: input image tensor. The original author notes its size as
            64, 64, 3; nothing here checks that.

    Side effects:
        Sets ``self.img``, ``self.l1`` .. ``self.l4``, ``self.l2_bn`` ..
        ``self.l4_bn`` and ``self.score``. Returns ``None``.
    """
    base_width = 64

    with tf.variable_scope('img_discriminator'):
        self.img = image

        # Stage 1: convolution followed by op.lrelu only. Batch norm for
        # this stage was commented out by the original author.
        stage1_conv = op.conv2d(self.img, base_width, name='l1')
        self.l1 = op.lrelu(stage1_conv)

        # Stage 2: convolution -> batch norm -> op.lrelu.
        stage2_conv = op.conv2d(self.l1, base_width * 2, name='l2')
        self.l2_bn = op.batch_norm(name='l2_bn0')
        self.l2 = op.lrelu(self.l2_bn(stage2_conv, train=self.train))

        # Stage 3: same pattern, four times the base width.
        stage3_conv = op.conv2d(self.l2, base_width * 4, name='l3')
        self.l3_bn = op.batch_norm(name='l3_bn0')
        self.l3 = op.lrelu(self.l3_bn(stage3_conv, train=self.train))

        # Stage 4: same pattern, eight times the base width.
        stage4_conv = op.conv2d(self.l3, base_width * 8, name='l4')
        self.l4_bn = op.batch_norm(name='l4_bn0')
        self.l4 = op.lrelu(self.l4_bn(stage4_conv, train=self.train))

        # Final scoring layer. The original author remarks that this
        # differs from the paper the model is based on.
        flattened = tf.reshape(self.l4, [self.batch_size, -1])
        self.score = op.linear(flattened, 1, 'final')
def discriminator(self, image, is_training, reuse=False):
    """Build the discriminator graph and return its real/fake prediction.

    Args:
        image: input tensor fed to the first convolution.
        is_training: forwarded unchanged to every ``batch_norm`` call.
        reuse: when truthy, switch the ``discriminator`` variable scope to
            reuse mode before any layer is created.

    Returns:
        A 2-tuple ``(tf.sigmoid(logits), logits)`` where ``logits`` is the
        output of the final ``fc`` layer.
    """
    with tf.variable_scope("discriminator"):
        if reuse:
            tf.get_variable_scope().reuse_variables()

        width = self.discriminator_dim

        # Shape notes are carried over from the original author and assume
        # a [batch, 256, 256, 1] input -- not verified here.

        # [batch,256,256,1] -> [batch,128,128,64]; no batch norm here.
        conv0 = conv2d(image, width, scope="d_h0_conv")
        act0 = lrelu(conv0)

        # [batch,128,128,64] -> [batch,64,64,64*2]
        conv1 = conv2d(act0, width * 2, scope="d_h1_conv")
        act1 = lrelu(batch_norm(conv1, is_training, scope="d_bn_1"))

        # [batch,64,64,64*2] -> [batch,32,32,64*4]
        conv2 = conv2d(act1, width * 4, scope="d_h2_conv")
        act2 = lrelu(batch_norm(conv2, is_training, scope="d_bn_2"))

        # Only stage that overrides sh/sw (presumably the strides) to 1.
        # The original note claims [batch,32,32,64*4] -> [batch,31,31,64*8]
        # -- TODO confirm the 31x31 figure against conv2d's padding.
        conv3 = conv2d(act2, width * 8, sh=1, sw=1, scope="d_h3_conv")
        act3 = lrelu(batch_norm(conv3, is_training, scope="d_bn_3"))

        # Real-or-fake binary output: flatten per example, project to 1.
        flat = tf.reshape(act3, [self.batch_size, -1])
        logits = fc(flat, 1, scope="d_fc1")

        return tf.sigmoid(logits), logits
def discriminator(self, image, is_training, reuse=False):
    """Build the discriminator graph with a two-layer fully connected head.

    Args:
        image: input tensor fed to the first convolution.
        is_training: forwarded unchanged to every ``batch_norm`` call.
        reuse: when truthy, switch the ``discriminator`` variable scope to
            reuse mode before any layer is created.

    Returns:
        A 2-tuple ``(tf.nn.sigmoid(logits), logits)`` where ``logits`` is
        the output of the second ``fc`` layer (``d_fc2``).
    """
    with tf.variable_scope("discriminator"):
        if reuse:
            tf.get_variable_scope().reuse_variables()

        width = self.discriminator_dim

        # First stage has no batch norm.
        conv0 = conv2d(image, width, scope="d_h0_conv")
        act0 = lrelu(conv0)

        conv1 = conv2d(act0, width * 2, scope="d_h1_conv")
        act1 = lrelu(batch_norm(conv1, is_training, scope="d_bn_1"))

        conv2 = conv2d(act1, width * 4, scope="d_h2_conv")
        act2 = lrelu(batch_norm(conv2, is_training, scope="d_bn_2"))

        conv3 = conv2d(act2, width * 8, scope="d_h3_conv")
        act3 = lrelu(batch_norm(conv3, is_training, scope="d_bn_3"))

        # Two further stages (d_h4_conv / d_h5_conv, both width * 8) were
        # disabled by the original author; the head reads from stage 3.

        # Real-or-fake binary output: flatten, squeeze to 8 units, then 1.
        flat = tf.reshape(act3, [self.batch_size, -1])
        hidden = fc(flat, 8, scope="d_fc1")
        logits = fc(hidden, 1, scope="d_fc2")

        return tf.nn.sigmoid(logits), logits
def encode_layer(x, output_filters, layer):
    """Run one ``lrelu`` -> ``conv2d`` -> ``batch_norm`` encoder stage.

    Args:
        x: input tensor for this stage.
        output_filters: passed to ``conv2d`` as ``output_filters``.
        layer: stage index, formatted with ``%d`` into the scope names
            ``d_e<layer>_conv`` and ``d_e<layer>_bn``.

    Returns:
        The result of ``batch_norm`` applied to the convolution output.

    Note:
        ``is_training`` is not a parameter; it is looked up from the
        surrounding scope (presumably this is a nested helper -- confirm).
    """
    activated = lrelu(x)
    convolved = conv2d(
        activated,
        output_filters=output_filters,
        scope="d_e%d_conv" % layer,
    )
    return batch_norm(convolved, is_training, scope="d_e%d_bn" % layer)
def build_image_generator(self, img_z, sen_rep):
    """Build the text-conditioned image generator graph.

    The sentence representation is projected to 128 units, passed through
    ``op.lrelu`` and concatenated with the noise vector. The result is
    linearly projected and reshaped to ``[batch, 4, 4, 512]``, then run
    through four ``op.deconv2d`` stages whose requested output shapes are
    8x8x256, 16x16x128, 32x32x64 and 64x64x3. ``tanh`` squashes the final
    output into ``[-1, 1]``.

    Args:
        img_z: noise tensor, concatenated along axis 1 with the projected
            sentence vector.
        sen_rep: sentence representation tensor fed to ``op.linear``.

    Side effects:
        Sets ``self.l0`` .. ``self.l5``, ``self.l1_bn`` .. ``self.l4_bn``
        and ``self.fake_img``. Returns ``None``.

    Raises:
        AssertionError: if ``self.config.generator_l1_nchannel`` is not a
            multiple of 8, or if the static shape of ``self.fake_img`` is
            not ``(?, 64, 64, 3)``.

    Note:
        The checks are explicit ``raise`` statements rather than
        ``assert cond, logger.error(...)``: that form used the logger's
        return value as the assertion message and was skipped entirely
        under ``python -O``. The exception type is unchanged.
    """
    with tf.variable_scope('img_generator'):
        # NOTE(review): generator_l1_nchannel is validated here but the
        # layer widths below are fixed constants -- confirm intent.
        if self.config.generator_l1_nchannel % 8 != 0:
            message = '[ERROR] Invalid channel size'
            logger.error(message)
            raise AssertionError(message)

        # Target (height, width, channels) of every layer's output. Each
        # deconv stage doubles the spatial size of the previous one.
        l1_h, l1_w, l1_c = 4, 4, 512
        l2_h, l2_w, l2_c = 8, 8, 256
        l3_h, l3_w, l3_c = 16, 16, 128
        l4_h, l4_w, l4_c = 32, 32, 64
        l5_h, l5_w, l5_c = 64, 64, 3

        # Layer 0: combine the conditional vector with the noise vector.
        # tf.concat(axis, values) is the pre-1.0 TensorFlow argument order.
        cond_vec = op.linear(sen_rep, 128, 'conditional_vec')
        self.l0 = tf.concat(1, [img_z, op.lrelu(cond_vec)])

        # Layer 1: linear projection reshaped into a 4x4x512 feature map.
        projected = op.linear(self.l0, l1_w * l1_h * l1_c, 'l0_lin')
        seed_map = tf.reshape(projected,
                              [self.batch_size, l1_h, l1_w, l1_c])
        self.l1_bn = op.batch_norm(name='l1_bn0')
        self.l1 = tf.nn.relu(self.l1_bn(seed_map, train=self.train))

        # Layer 2: first deconv, requested output 8x8x256.
        up2 = op.deconv2d(self.l1,
                          [self.batch_size, l2_h, l2_w, l2_c], name='l2')
        self.l2_bn = op.batch_norm(name='l2_bn0')
        self.l2 = tf.nn.relu(self.l2_bn(up2, train=self.train))

        # Layer 3: second deconv, requested output 16x16x128.
        up3 = op.deconv2d(self.l2,
                          [self.batch_size, l3_h, l3_w, l3_c], name='l3')
        self.l3_bn = op.batch_norm(name='l3_bn0')
        self.l3 = tf.nn.relu(self.l3_bn(up3, train=self.train))

        # Layer 4: third deconv, requested output 32x32x64.
        up4 = op.deconv2d(self.l3,
                          [self.batch_size, l4_h, l4_w, l4_c], name='l4')
        self.l4_bn = op.batch_norm(name='l4_bn0')
        self.l4 = tf.nn.relu(self.l4_bn(up4, train=self.train))

        # Layer 5: final deconv to image size; no batch norm, tanh output.
        self.l5 = op.deconv2d(self.l4,
                              [self.batch_size, l5_h, l5_w, l5_c],
                              name='l5')
        self.fake_img = tf.nn.tanh(self.l5)  # values in [-1, 1]

        # Sanity-check the static shape of the generated image.
        img_shape = self.fake_img.get_shape()
        shape_ok = ((img_shape[1] == l5_h) and
                    (img_shape[2] == l5_w) and
                    (img_shape[3] == l5_c))
        if not shape_ok:
            message = 'Wrong fake image dimension: {}'.format(img_shape)
            logger.error(message)
            raise AssertionError(message)
def build_models(self, image, sentence_vec):
    """Assemble the text-conditioned image discriminator graph.

    The image goes through four ``op.conv2d`` stages (second argument 64,
    128, 256, 512). The sentence vector is projected to 128 units, tiled
    over a 4x4 grid and concatenated to the stage-4 activation along axis
    3. A fifth ``op.conv2d`` stage and a linear layer turn that into one
    score per example.

    Args:
        image: input image tensor. The original author notes its size as
            64, 64, 3; nothing here checks that.
        sentence_vec: sentence representation fed to ``op.linear``.

    Side effects:
        Sets ``self.img``, ``self.sentence_vec`` (ends up as the tiled
        tensor), ``self.l1`` .. ``self.l5`` (``self.l4`` ends up as the
        image/text concatenation), ``self.l2_bn`` .. ``self.l5_bn`` and
        ``self.score``. Returns ``None``.
    """
    with tf.variable_scope('img_discriminator'):
        self.img = image

        # Conditional vector, [batch, 128] per the original author's note.
        text_proj = op.linear(sentence_vec, 128, 'conditional_vec')
        self.sentence_vec = op.lrelu(text_proj)

        # Second argument passed to each of the four image conv stages.
        c1, c2, c3, c4 = 64, 128, 256, 512
        # Tile multiples for the text vector. The original author notes
        # the stage-4 activation as 4, 4, 512.
        grid_h, grid_w = 4, 4

        # Stage 1: convolution + op.lrelu. Batch norm for this stage was
        # commented out by the original author.
        conv1 = op.conv2d(self.img, c1, name='l1')
        self.l1 = op.lrelu(conv1)

        # Stage 2
        conv2 = op.conv2d(self.l1, c2, name='l2')
        self.l2_bn = op.batch_norm(name='l2_bn0')
        self.l2 = op.lrelu(self.l2_bn(conv2, train=self.train))

        # Stage 3
        conv3 = op.conv2d(self.l2, c3, name='l3')
        self.l3_bn = op.batch_norm(name='l3_bn0')
        self.l3 = op.lrelu(self.l3_bn(conv3, train=self.train))

        # Stage 4
        conv4 = op.conv2d(self.l3, c4, name='l4')
        self.l4_bn = op.batch_norm(name='l4_bn0')
        image_features = op.lrelu(self.l4_bn(conv4, train=self.train))

        # Attach the text information: insert two unit axes
        # ([batch, 128] -> [batch, 1, 1, 128]), tile them over the grid,
        # then concatenate along axis 3.
        text_map = tf.expand_dims(self.sentence_vec, 1)
        text_map = tf.expand_dims(text_map, 2)
        self.sentence_vec = tf.tile(text_map, [1, grid_h, grid_w, 1])
        self.l4 = tf.concat(3, [image_features, self.sentence_vec])

        # Stage 5: the four positional 1s are presumably kernel height /
        # width and strides, i.e. a 1x1 stride-1 conv -- confirm against
        # op.conv2d's signature.
        conv5 = op.conv2d(self.l4, c4, 1, 1, 1, 1, name='l5')
        self.l5_bn = op.batch_norm(name='l5_bn0')
        self.l5 = op.lrelu(self.l5_bn(conv5, train=self.train))

        # Final scoring layer. The original author remarks that this
        # differs from the paper the model is based on.
        flattened = tf.reshape(self.l5, [self.batch_size, -1])
        self.score = op.linear(flattened, 1, 'final')
def build_sentence_generator(self):
    """Build the sentence generator graph (marked as tricky/unfinished).

    The original author's note: it is not settled how text should be
    generated. As written, the method concatenates ``self.sen_z`` with
    the activated ``self.img_rep``, projects that to the GRU hidden size,
    creates the vocabulary projection variables, and runs a seq2seq
    decoder over the embedded ``self.teacher_forcing`` inputs.

    Side effects:
        Sets ``self.l0``, ``self.l1``, ``self.h0_w``, ``self.h0_b``,
        ``self.l1_bn``, ``self.vocabulary_mat``,
        ``self.vocabulary_mat_trans``, ``self.vocabulary_bias``,
        ``self.teacher_forcing_embedding`` and ``self.fake_sentence``
        (a list with one projected tensor per decoder output).

    Returns:
        int: always ``0``.
    """
    hidden_dim = self.config.text_gen_hidden_dim

    with tf.variable_scope('sen_generator'):
        # Conditional vector: noise joined with the image representation.
        self.l0 = tf.concat(1, [self.sen_z, op.lrelu(self.img_rep)])

        # Layer 1: map the raw conditional vector to the initial state.
        projected, self.h0_w, self.h0_b = op.linear(
            self.l0, hidden_dim, 'l0_lin', with_w=True)
        reshaped = tf.reshape(projected, [self.batch_size, hidden_dim])
        self.l1_bn = op.batch_norm(name='l1_bn0')
        self.l1 = tf.nn.relu(self.l1_bn(reshaped, train=self.train))

        # Layer 2: the recurrent part.
        gru_cell = tf.nn.rnn_cell.GRUCell(hidden_dim)

        # Vocabulary matrix. Per the original author this is distinct
        # from the word-embedding matrix.
        self.vocabulary_mat = tf.get_variable(
            'voc_mat',
            initializer=tf.random_normal(
                [hidden_dim, self.config.word_embedding_space_size]))
        self.vocabulary_mat_trans = tf.transpose(self.vocabulary_mat)
        # NOTE(review): the bias has hidden_dim entries while it is added
        # after multiplying by the transposed matrix -- confirm the
        # intended shapes.
        self.vocabulary_bias = tf.get_variable(
            'voc_bias', initializer=tf.random_normal([hidden_dim]))

        # Loop function called at each time step, or None.
        # NOTE(review): self.teacher_forcing is used both as this
        # condition and as the ids for the embedding lookup below, and
        # the argmax-feedback loop is installed when it is truthy --
        # confirm this is the intended polarity.
        loop_fn = (
            tf.nn.seq2seq._extract_argmax_and_embed(
                self.word_embedding,
                output_projection=(self.vocabulary_mat_trans,
                                   self.vocabulary_bias),
                update_embedding=False)
            if self.teacher_forcing else None
        )

        self.teacher_forcing_embedding = tf.nn.embedding_lookup(
            self.word_embedding, self.teacher_forcing)

        # NOTE(review): the decoder is seeded with self.l0, not with the
        # projected self.l1 computed above -- confirm.
        step_outputs, _final_state = tf.nn.seq2seq.decoder(
            self.teacher_forcing_embedding, self.l0, gru_cell,
            loop_function=loop_fn)

        # Project every decoder output with the vocabulary parameters.
        self.fake_sentence = [
            tf.nn.xw_plus_b(step_output,
                            self.vocabulary_mat_trans,
                            self.vocabulary_bias)
            for step_output in step_outputs
        ]

    return 0