Example #1
 def q_y_x(self, captions, lengths, n_classes):
     """
     Returns:
         x_logits: classifier unnormalized log probabilities
     """
     with tf.variable_scope("net"):
         with tf.device("/cpu:0"):
             embedding = tf.get_variable("dec_embeddings",
                                         [self.vocab_size, self.embed_size],
                                         dtype=tf.float32)
             vect_inputs = tf.nn.embedding_lookup(embedding, captions)
         keep_prob = tf.placeholder_with_default(1.0, (),
                                                 name='classifier_drop')
         cell_0 = make_rnn_cell([self.lstm_hidden],
                                base_cell=tf.contrib.rnn.LSTMCell,
                                dropout_keep_prob=keep_prob)
         zero_state0 = cell_0.zero_state(
             batch_size=tf.shape(self.images_fv)[0],
             dtype=tf.float32)
         initial_state = zero_state0
         # _, initial_state = cell_0(self.images_fv, zero_state0)
         # captions LSTM
         outputs, final_state = tf.nn.dynamic_rnn(
             cell_0,
             inputs=vect_inputs,
             sequence_length=lengths,
             initial_state=initial_state,
             swap_memory=True,
             dtype=tf.float32)
         # classify from the final hidden state h of the single LSTM layer
         y_logits = tf.layers.dense(final_state[0][1], n_classes)
     return y_logits
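A minimal sketch (TF 1.x) of how the classifier logits returned above might be consumed; the `labels` placeholder and the stand-in shapes are assumptions for illustration, not part of the original class:

    import tensorflow as tf

    n_classes = 10                                             # assumed for the sketch
    y_logits = tf.placeholder(tf.float32, [None, n_classes])   # stands in for q_y_x(...)
    labels = tf.placeholder(tf.int32, [None], name='class_labels')
    # standard softmax cross-entropy on the unnormalized log probabilities
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                       logits=y_logits))
    pred = tf.argmax(y_logits, axis=-1)                        # hard class prediction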
Example #2
 def decoder(self, gen_mode=False):
     """
     Args:
         gen_mode: if True, build the graph for caption generation
     Returns:
         x_logits: mapping to vocabulary, for training
         states: tuple (initial_state, final_state, sample), for generation
     """
     # encoder and decoder have different embeddings but the same image features
     with tf.variable_scope("net") as scope:
         with tf.device("/cpu:0"):
             embedding = tf.get_variable(
                     "dec_embeddings", [self.params.vocab_size,
                                        self.params.embed_size],
                     dtype=tf.float32)
             vect_inputs = tf.nn.embedding_lookup(embedding,
                                                  self.captions)
         dec_lstm_drop = self.params.dec_lstm_drop
         if gen_mode:
             dec_lstm_drop = 1.0
         cell_0 = make_rnn_cell(
             [self.params.decoder_hidden for _ in range(
                 self.params.decoder_rnn_layers)],
             base_cell=tf.contrib.rnn.LSTMCell,
             dropout_keep_prob=dec_lstm_drop)
         zero_state0 = cell_0.zero_state(
             batch_size=tf.shape(self.images_fv)[0],
             dtype=tf.float32)
         # run this cell to get initial state
         _, initial_state0 = cell_0(self.images_fv, zero_state0)
         if self.c_i is not None and self.params.use_c_v:
             _, initial_state0 = cell_0(self.c_i, initial_state0)
         initial_state = rnn_placeholders(initial_state0)
         # captions LSTM
         outputs, final_state = tf.nn.dynamic_rnn(cell_0,
                                                  inputs=vect_inputs,
                                                  sequence_length=self.lengths,
                                                  initial_state=initial_state,
                                                  swap_memory=True,
                                                  dtype=tf.float32)
     # output shape [batch_size, seq_length, self.params.decoder_hidden]
     if gen_mode:
         # only interested in the last output
         outputs = outputs[:, -1, :]
     outputs_r = tf.reshape(outputs, [-1, cell_0.output_size])
     x_logits = tf.layers.dense(outputs_r,
                                units=self.data_dict.vocab_size,
                                name='rnn_logits')
     # for generating
     sample = None
     if gen_mode:
         if self.params.sample_gen == 'sample':
             sample = tf.multinomial(
                 x_logits / self.params.temperature, 1)[0][0]
         else:
             # 'beam_search' and greedy decoding both consume the full
             # softmax distribution downstream
             sample = tf.nn.softmax(x_logits)
     return x_logits, (initial_state, final_state, sample)
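Because `initial_state` is wrapped with `rnn_placeholders`, the returned state tuple can be fed back step by step at generation time. A hedged sketch for the `sample_gen == 'sample'` case; `sess`, the caption/length placeholders, `max_len`, and the `<BOS>`/`<EOS>` ids are all assumptions:

    x_logits, (init_state, fin_state, sample_op) = decoder(gen_mode=True)
    cur, state, caption = bos_id, None, []
    for _ in range(max_len):
        feed = {captions_ph: [[cur]], lengths_ph: [1]}
        if state is not None:
            feed[init_state] = state   # nested placeholder structures are feedable
        cur, state = sess.run([sample_op, fin_state], feed)
        if cur == eos_id:
            break
        caption.append(int(cur))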
Example #3
 def q_z_xy(self, captions, labels, lengths, images=None):
     """Calculate approximate posterior q(z|x, y, f(I))
     Returns:
         model: zhusuan model object, can be used for getting probabilities
     """
     if images is not None:
         self.images_fv = images
     with zs.BayesianNet() as model:
         # encoder and decoder have different embeddings but the same image features
         with tf.device("/cpu:0"):
             embedding = tf.get_variable(
                         "enc_embeddings", [self.vocab_size,
                                            self.embed_size],
                         dtype=tf.float32)
             vect_inputs = tf.nn.embedding_lookup(embedding, captions)
         with tf.name_scope(name="net") as scope1:
             cell_0 = make_rnn_cell(
                 [self.lstm_hidden],
                 base_cell=tf.contrib.rnn.LSTMCell)
             zero_state0 = cell_0.zero_state(
                 batch_size=tf.shape(self.images_fv)[0],
                 dtype=tf.float32)
             # run this cell to get initial state
             added_shape = self.embed_size + self.params.n_classes
             im_f = tf.layers.dense(self.images_fv, added_shape)
             _, initial_state0 = cell_0(im_f, zero_state0)
             # c = h = tf.layers.dense(self.images_fv,
             #                         self.params.decoder_hidden,
             #                         name='dec_init_map')
             # initial_state0 = (tf.nn.rnn_cell.LSTMStateTuple(c, h), )
             # x, y
             y = tf.tile(tf.expand_dims(labels, 1),
                         [1, tf.shape(vect_inputs)[1], 1])
             vect_inputs = tf.concat([vect_inputs, tf.to_float(y)], 2)
             outputs, final_state = tf.nn.dynamic_rnn(cell_0,
                                                      inputs=vect_inputs,
                                                      sequence_length=lengths,
                                                      initial_state=initial_state0,
                                                      swap_memory=True,
                                                      dtype=tf.float32,
                                                      scope=scope1)
         # [batch_size, 2 * lstm_hidden_size]
         # final_state = ((c, h), )
         final_state = final_state[0][1]
         lz_mean = layers.dense(inputs=final_state,
                                units=self.latent_size,
                                activation=None)
         lz_logstd = layers.dense(inputs=final_state,
                                  units=self.latent_size,
                                  activation=None)
         lz_std = tf.exp(lz_logstd)
         # define the latent variable's stochastic tensor
         # add mu_k, sigma_k for the CVAE / AG-CVAE variants
         tm_list = []  # means
         tl_list = []  # log standard deviations
         z = zs.Normal('z', mean=lz_mean, std=lz_std, group_ndims=1,
                       n_samples=self.z_samples)
     return model, tm_list, tl_list
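A sketch of querying the returned `BayesianNet`; zhusuan's interface has changed across versions, so `outputs` and `local_log_prob` here assume the 0.3-style API:

    model, _, _ = self.q_z_xy(captions, labels, lengths)
    qz_samples = model.outputs('z')       # z ~ q(z|x, y, f(I)), [n_samples, batch, latent]
    log_qz = model.local_log_prob('z')    # log q(z|x, y, f(I)) at those samples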
Example #4
    def q_net(self):
        """Calculate approximate posterior q(z|x, f(I))
        Returns:
            model: zhusuan model object, can be used for getting probabilities
        """
        with zs.BayesianNet() as model:
            # encoder and decoder have different embeddings but the same image features
            with tf.device("/cpu:0"):
                embedding = tf.get_variable(
                    "enc_embeddings",
                    [self.params.vocab_size, self.params.embed_size],
                    dtype=tf.float32)
                vect_inputs = tf.nn.embedding_lookup(embedding, self.captions)
            with tf.name_scope(name="encoder0") as scope1:
                cell_0 = make_rnn_cell(
                    [self.params.encoder_hidden
                     for _ in range(self.params.encoder_rnn_layers)],
                    base_cell=tf.contrib.rnn.LSTMCell)
                zero_state0 = cell_0.zero_state(
                    batch_size=tf.shape(self.images_fv)[0],
                    dtype=tf.float32)
                # run this cell to get initial state
                _, initial_state0 = cell_0(self.images_fv, zero_state0)
                if self.c_i is not None and self.params.use_c_v:
                    _, initial_state0 = cell_0(self.c_i, initial_state0)
                outputs, final_state = tf.nn.dynamic_rnn(
                    cell_0,
                    inputs=vect_inputs,
                    sequence_length=self.lengths,
                    initial_state=initial_state0,
                    swap_memory=True,
                    dtype=tf.float32,
                    scope=scope1)
            # [batch_size, 2 * lstm_hidden_size]
            # final_state = ((c, h), )
            final_state = tf.concat(values=final_state[0],
                                    axis=1,
                                    name="encoder_hidden")
            if self.params.prior == 'Normal':
                lz_mean = layers.dense(inputs=final_state,
                                       units=self.params.latent_size,
                                       activation=None)
                lz_logstd = layers.dense(inputs=final_state,
                                         units=self.params.latent_size,
                                         activation=None)
            # define the latent variable's stochastic tensor
            # add mu_k, sigma_k for the CVAE / AG-CVAE variants
            tm_list = []  # means
            tl_list = []  # log standard deviations
            if self.params.prior == 'GMM':
                cluster = tf.squeeze(tf.multinomial(self.c_i_ph, 1))
                indices = tf.squeeze(tf.range(tf.shape(self.c_i_ph)[0]))
                cluster = tf.stack([indices, tf.cast(cluster, tf.int32)], 1)
                for i in range(90):
                    with tf.variable_scope("gmm_ll_{}".format(i)):
                        lz_mean = layers.dense(inputs=final_state,
                                               units=self.params.latent_size)
                        lz_logstd = layers.dense(inputs=final_state,
                                                 units=self.params.latent_size)
                        tm_list.append(tf.expand_dims(lz_mean, 1))
                        tl_list.append(tf.expand_dims(lz_logstd, 1))
                # [batch_size, 90, z_dim]
                tm_list = tf.concat(tm_list, 1)
                tl_list = tf.concat(tl_list, 1)
                lz_mean = tf.gather_nd(tm_list, cluster)
                lz_logstd = tf.gather_nd(tl_list, cluster)

            if self.params.prior == 'AG':
                #clusters = tf.argmax(self.c_i_ph, 1)
                # [batch_size, 150]?
                # ck*N(mu, sigma)
                for i in range(90):
                    with tf.variable_scope("ag_ll_{}".format(i)):
                        lz_mean = layers.dense(inputs=final_state,
                                               units=self.params.latent_size)
                        lz_logstd = layers.dense(inputs=final_state,
                                                 units=self.params.latent_size)
                        tm_list.append(tf.expand_dims(lz_mean, 1))
                        tl_list.append(tf.expand_dims(lz_logstd, 1))
                # [batch_size, 90, 150]
                # ob_vector [batch_size, 90]
                # need [batch_size, 150]
                tm_list = tf.concat(tm_list, 1)
                tl_list = tf.concat(tl_list, 1)
                c_i_exp = tf.expand_dims(self.c_i_ph, 1)
                lz_mean = tf.squeeze(tf.matmul(c_i_exp, tm_list), 1)
                lz_logstd = tf.squeeze(tf.matmul(c_i_exp, tl_list), 1)
                # debug
                #print(lz_mean)
            z = zs.Normal('z',
                          lz_mean,
                          lz_logstd,
                          group_event_ndims=1,
                          n_samples=self.params.gen_z_samples)
        return z, tm_list, tl_list
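In the 'AG' branch the posterior mean is a convex combination of per-cluster means, mu = sum_k c_k * mu_k, which is exactly what the `tf.matmul` with the expanded cluster vector computes. A small NumPy check of the same contraction (shapes only, values are random):

    import numpy as np

    batch, K, latent = 2, 90, 150
    c = np.random.dirichlet(np.ones(K), size=batch)    # cluster weights, [batch, 90]
    M = np.random.randn(batch, K, latent)              # per-cluster means, [batch, 90, latent]
    mu = np.squeeze(np.expand_dims(c, 1) @ M, axis=1)  # [batch, latent], as in the code above
    assert np.allclose(mu, np.einsum('bk,bkl->bl', c, M))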
Example #5
 def px_z_fi(self, observed, gen_mode=False):
     """
     Args:
         observed: for q, parametrized by encoder, used during training
     Returns:
         model: zhusuan model object, can be used for getting probabilities
     """
     with zs.BayesianNet(observed) as model:
         z_mean = tf.zeros([tf.shape(self.images_fv)[0],
                            self.params.latent_size])
         z = zs.Normal('z', mean=z_mean, std=self.params.std,
                       group_event_ndims=1,
                       n_samples=self.params.gen_z_samples)
         # encoder and decoder have different embeddings but the same image features
         with tf.variable_scope("net") as scope:
             with tf.device("/cpu:0"):
                 embedding = tf.get_variable(
                         "dec_embeddings", [self.params.vocab_size,
                                            self.params.embed_size],
                         dtype=tf.float32)
                 vect_inputs = tf.nn.embedding_lookup(embedding,
                                                      self.captions)
             # captions dropout
             if self.params.dec_keep_rate < 1 and not gen_mode:
                 vect_inputs = tf.nn.dropout(vect_inputs,
                                             self.params.dec_keep_rate)
             dec_lstm_drop = self.params.dec_lstm_drop
             if gen_mode:
                 dec_lstm_drop = 1.0
             cell_0 = make_rnn_cell(
                 [self.params.decoder_hidden for _ in range(
                     self.params.decoder_rnn_layers)],
                 base_cell=tf.contrib.rnn.LSTMCell,
                 dropout_keep_prob=dec_lstm_drop)
             zero_state0 = cell_0.zero_state(
                 batch_size=tf.shape(self.images_fv)[0],
                 dtype=tf.float32)
             # run this cell to get initial state
             _, initial_state0 = cell_0(self.images_fv, zero_state0)
             if self.c_i is not None and self.params.use_c_v:
                 _, initial_state0 = cell_0(self.c_i, initial_state0)
             if self.params.no_encoder:
                 if not gen_mode:
                     print("Not using q(z|x)")
                 initial_state = rnn_placeholders(initial_state0)
             else:
                 # vector z, mapped into embed_dim
                 z = tf.reshape(z, [-1, self.params.latent_size *
                                    self.params.gen_z_samples])
                 z_dec = layers.dense(z, self.params.embed_size,
                                      name='z_rnn')
                 _, z_state = cell_0(z_dec, initial_state0)
                 initial_state = rnn_placeholders(z_state)
             # captions LSTM
             # TODO: correct sequence_length implementation
             outputs, final_state = tf.nn.dynamic_rnn(cell_0,
                                                      inputs=vect_inputs,
                                                      sequence_length=None,
                                                      initial_state=initial_state,
                                                      swap_memory=True,
                                                      dtype=tf.float32)
         # output shape [batch_size, seq_length, self.params.decoder_hidden]
         if gen_mode:
             # only interested in the last output
             outputs = outputs[:, -1, :]
         outputs_r = tf.reshape(outputs, [-1, cell_0.output_size])
         x_logits = tf.layers.dense(outputs_r,
                                    units=self.data_dict.vocab_size,
                                    name='rnn_logits')
         # for debugging
         shpe = (tf.shape(z), tf.shape(outputs_r),
                 tf.shape(outputs))
         # for generating
         sample = None
         if gen_mode:
             if self.params.sample_gen == 'sample':
                 sample = tf.multinomial(
                     x_logits / self.params.temperature, 1)[0][0]
             else:
                 # 'beam_search' and greedy decoding both consume the full
                 # softmax distribution downstream
                 sample = tf.nn.softmax(x_logits)
     return model, x_logits, shpe, (initial_state, final_state, sample)
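The flattened `x_logits` ([batch * seq_len, vocab]) are typically turned into a masked reconstruction loss during training. A hedged sketch; the `targets` and `lengths` placeholders and the vocabulary size are assumptions:

    import tensorflow as tf

    targets = tf.placeholder(tf.int32, [None, None])     # ground-truth token ids
    lengths = tf.placeholder(tf.int32, [None])
    x_logits = tf.placeholder(tf.float32, [None, 5000])  # stands in for the decoder output
    mask = tf.to_float(tf.reshape(tf.sequence_mask(lengths), [-1]))
    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.reshape(targets, [-1]), logits=x_logits)
    rec_loss = tf.reduce_sum(ce * mask) / tf.reduce_sum(mask)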
Example #6
 def px_z_fi(self, observed, gen_mode=False):
     """
     Args:
         observed: for q, parametrized by encoder, used during training
     Returns:
         model: zhusuan model object, can be used for getting probabilities
     """
     with zs.BayesianNet(observed) as model:
         if not gen_mode or self.params.prior != 'AG':
             z_mean = tf.zeros([tf.shape(self.images_fv)[0],
                                self.params.latent_size])
         else:  # 'AG' prior at generation time
             # choose clusters (batches of images are currently unsupported)
             # average the cluster means for this particular image
             c_indices = tf.where(self.c_i_ph[0] > 0)  # [num_true, indices]
             pred = tf.equal(tf.shape(c_indices)[0], 0)
             def false(): return tf.squeeze(tf.transpose(c_indices))
             def true():
                 # cl_range = tf.range(
                     # tf.cast(tf.shape(self.cap_clusters)[0], tf.int64))
                 # some classes are unused; don't condition on them
                 # 0, 66, 68, 69, 71, 12, 45, 83, 26, 29, 30
                 un_clusters = {0, 66, 68, 69, 71, 12, 45, 83, 26, 29, 30}
                 cl_num = [i for i in range(self.params.num_clusters + 1)
                           if i not in un_clusters]
                 return tf.convert_to_tensor(cl_num, dtype=tf.int64)
             c_indices = tf.cond(pred, true, false)
             # cap_clusters: [num_clusters, num_z]
             means = tf.gather(self.cap_clusters, c_indices, axis=0)
             # if only one cluster (any better way?)
             def false(): return means
             def true(): return tf.expand_dims(means, 0)
             pred = tf.equal(tf.shape(means)[0],
                             tf.shape(self.cap_clusters)[1])
             means = tf.cond(pred, true, false)
             # find mean cluster for current picture
             z_mean = tf.reduce_mean(means, axis=0)
             z_mean = tf.reshape(z_mean, [1, self.params.latent_size])
         z = zs.Normal('z', mean=z_mean, std=self.params.std,
                       group_event_ndims=1,
                       n_samples=self.params.gen_z_samples)
         # encoder and decoder have different embeddings but the same image features
         with tf.variable_scope("net") as scope:
             with tf.device("/cpu:0"):
                 embedding = tf.get_variable(
                         "dec_embeddings", [self.params.vocab_size,
                                            self.params.embed_size],
                         dtype=tf.float32)
                 vect_inputs = tf.nn.embedding_lookup(embedding,
                                                      self.captions)
             # captions dropout
             if self.params.dec_keep_rate < 1 and not gen_mode:
                 vect_inputs = tf.nn.dropout(vect_inputs,
                                             self.params.dec_keep_rate)
             dec_lstm_drop = self.params.dec_lstm_drop
             if gen_mode:
                 dec_lstm_drop = 1.0
             cell_0 = make_rnn_cell(
                 [self.params.decoder_hidden for _ in range(
                     self.params.decoder_rnn_layers)],
                 base_cell=tf.contrib.rnn.LSTMCell,
                 dropout_keep_prob=dec_lstm_drop)
             zero_state0 = cell_0.zero_state(
                 batch_size=tf.shape(self.images_fv)[0],
                 dtype=tf.float32)
             # run this cell to get initial state
             _, initial_state0 = cell_0(self.images_fv, zero_state0)
             if self.c_i is not None and self.params.use_c_v:
                 _, initial_state0 = cell_0(self.c_i, initial_state0)
             if self.params.no_encoder:
                 if not gen_mode:
                     print("Not using q(z|x)")
                 initial_state = rnn_placeholders(initial_state0)
             else:
                 # vector z, mapped into embed_dim
                 z = tf.reshape(z, [-1, self.params.latent_size *
                                    self.params.gen_z_samples])
                 z_dec = layers.dense(z, self.params.embed_size,
                                      name='z_rnn')
                 _, z_state = cell_0(z_dec, initial_state0)
                 initial_state = rnn_placeholders(z_state)
             # captions LSTM
             outputs, final_state = tf.nn.dynamic_rnn(cell_0,
                                                      inputs=vect_inputs,
                                                      sequence_length=self.lengths,
                                                      initial_state=initial_state,
                                                      swap_memory=True,
                                                      dtype=tf.float32)
         # output shape [batch_size, seq_length, self.params.decoder_hidden]
         if gen_mode:
             # only interested in the last output
             outputs = outputs[:, -1, :]
         outputs_r = tf.reshape(outputs, [-1, cell_0.output_size])
         x_logits = tf.layers.dense(outputs_r,
                                    units=self.data_dict.vocab_size,
                                    name='rnn_logits')
         # for debugging
         shpe = (tf.shape(z), tf.shape(outputs_r),
                 tf.shape(outputs))
         # for generating
         sample = None
         if gen_mode:
             if self.params.sample_gen == 'sample':
                 sample = tf.multinomial(
                     x_logits / self.params.temperature, 1)[0][0]
             else:
                 # 'beam_search' and greedy decoding both consume the full
                 # softmax distribution downstream
                 sample = tf.nn.softmax(x_logits)
     return model, x_logits, shpe, (initial_state, final_state, sample)
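At generation time the 'AG' branch above averages the centroids of the clusters detected in the image, falling back to all usable clusters when none are detected. A NumPy sketch of that selection logic; `centroids` stands in for `self.cap_clusters`:

    import numpy as np

    centroids = np.random.randn(91, 150)   # stand-in for self.cap_clusters
    c_i = np.zeros(91)
    c_i[[3, 7]] = 1.0                      # clusters detected in the image
    idx = np.flatnonzero(c_i > 0)
    if idx.size == 0:                      # fallback: all usable clusters
        unused = {0, 66, 68, 69, 71, 12, 45, 83, 26, 29, 30}
        idx = np.array([i for i in range(91) if i not in unused])
    z_mean = centroids[idx].mean(axis=0, keepdims=True)   # [1, latent_size]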
Example #7
 def px_z_y(self,
            observed,
            captions=None,
            lengths=None,
            gen_mode=False,
            n_x=None):
     """
     Args:
         observed: for q, parametrized by encoder, used during training
     Returns:
         model: zhusuan model object, can be used for getting probabilities
     """
     if captions is not None and lengths is not None:
         self.captions = captions
         self.lengths = lengths
     if n_x is None:
         n_x = tf.shape(self.images_fv)[0]
     with zs.BayesianNet(observed) as model:
         z_mean = tf.zeros([n_x, self.params.latent_size])
         z = zs.Normal('z',
                       mean=z_mean,
                       std=self.params.std,
                       group_ndims=1,
                       n_samples=self.params.gen_z_samples)
         tf.summary.histogram("distributions/z", z)
         y_logits = tf.zeros([n_x, self.n_classes])
         y = zs.OnehotCategorical('y',
                                  y_logits,
                                  n_samples=self.params.gen_z_samples)
         with tf.variable_scope("net"):
             embedding = tf.get_variable(
                 "dec_embeddings",
                 [self.data_dict.vocab_size, self.params.embed_size],
                 dtype=tf.float32)
             # word dropout: randomly replace input tokens with <UNK>
             word_drop_keep = self.params.word_dropout_keep
             if gen_mode:
                 word_drop_keep = 1.0
             # note: tf.nn.dropout rescales kept values by 1/keep_prob, which
             # corrupts integer token ids, so draw an explicit keep mask instead
             keep_mask = tf.cast(
                 tf.random_uniform(tf.shape(self.captions)) < word_drop_keep,
                 tf.int32)
             unk_idx = self.data_dict.word2idx['<UNK>']
             captions = self.captions * keep_mask + (1 - keep_mask) * unk_idx
             vect_inputs = tf.nn.embedding_lookup(embedding, captions)
             dec_lstm_drop = self.params.dec_lstm_drop
             if gen_mode:
                 dec_lstm_drop = 1.0
             cell_0 = make_rnn_cell([self.params.decoder_hidden],
                                    base_cell=tf.contrib.rnn.LSTMCell,
                                    dropout_keep_prob=dec_lstm_drop)
             # zero_state0 = cell_0.zero_state(
             #     batch_size=tf.shape(self.images_fv)[0],
             #     dtype=tf.float32)
             # run this cell to get initial state
             added_shape = (self.params.gen_z_samples * self.params.n_classes
                            + self.params.embed_size)
             # added_shape = self.params.embed_size
             # f_mapping = tf.layers.dense(self.images_fv, added_shape,
             #                             name='f_emb2')
             c = h = tf.layers.dense(self.images_fv,
                                     self.params.decoder_hidden,
                                     name='dec_init_map')
             initial_state0 = (tf.nn.rnn_cell.LSTMStateTuple(c, h), )
             # vector z, mapped into embed_dim
             z = tf.concat([z, tf.to_float(y)], 2)
             z = tf.reshape(z, [n_x, (self.params.latent_size + self.n_classes)
                                * self.params.gen_z_samples])
             z_dec = layers.dense(z, added_shape, name='z_rnn')
             _, z_state = cell_0(z_dec, initial_state0)
             initial_state = rnn_placeholders(z_state)
             # concat with inputs
             y_re = tf.to_float(
                 tf.reshape(y, [
                     tf.shape(self.images_fv)[0],
                     self.params.gen_z_samples * self.params.n_classes
                 ]))
             y = tf.tile(tf.expand_dims(y_re, 1),
                         [1, tf.shape(vect_inputs)[1], 1])
             vect_inputs = tf.concat([vect_inputs, y], 2)
             # vect_inputs = tf.Print(vect_inputs, [tf.shape(vect_inputs)],
             #                        first_n=1)
             # captions LSTM
             outputs, final_state = tf.nn.dynamic_rnn(
                 cell_0,
                 inputs=vect_inputs,
                 sequence_length=self.lengths,
                 initial_state=initial_state,
                 swap_memory=True,
                 dtype=tf.float32)
         # output shape [batch_size, seq_length, self.params.decoder_hidden]
         if gen_mode:
             # only interested in the last output
             outputs = outputs[:, -1, :]
         outputs_r = tf.reshape(outputs, [-1, cell_0.output_size])
         x_logits = tf.layers.dense(outputs_r,
                                    units=self.data_dict.vocab_size,
                                    name='rnn_logits')
         x_logits_r = tf.reshape(
             x_logits, [tf.shape(outputs)[0],
                        tf.shape(outputs)[1], -1])
         x = zs.Categorical('x', x_logits_r, group_ndims=1)
         # for generating
         sample = None
         if gen_mode:
             if self.params.sample_gen == 'sample':
                 sample = tf.multinomial(
                     x_logits / self.params.temperature, 1)[0][0]
             else:
                 # 'beam_search' and greedy decoding both consume the full
                 # softmax distribution downstream
                 sample = tf.nn.softmax(x_logits)
     return model, x_logits, (initial_state, final_state, sample)
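The class label enters the decoder twice above: concatenated with z when forming the initial state, and tiled across time steps so every input token sees it. A NumPy sketch mirroring the `tf.tile`/`tf.concat` calls (shapes are illustrative):

    import numpy as np

    batch, seq, embed, n_cls = 2, 4, 8, 3
    vect = np.random.randn(batch, seq, embed)     # token embeddings
    y = np.eye(n_cls)[[0, 2]]                     # one-hot labels, [batch, n_cls]
    y_t = np.repeat(y[:, None, :], seq, axis=1)   # tile over time
    inputs = np.concatenate([vect, y_t], axis=2)  # [batch, seq, embed + n_cls]
    assert inputs.shape == (batch, seq, embed + n_cls)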