def __init__(self, dim_image, n_words, dim_hidden, batch_size, n_caption_steps,
             n_video_steps, drop_out_rate, bias_init_vector=None):
    """Create the trainable variables and LSTM cells for the captioning model.

    Args:
        dim_image: dimensionality of the decoded video/image feature.
        n_words: vocabulary size.
        dim_hidden: LSTM hidden-state / embedding size.
        batch_size: mini-batch size.
        n_caption_steps: number of caption (word) time steps.
        n_video_steps: number of video (frame) time steps.
        drop_out_rate: dropout probability; cells keep ``1 - drop_out_rate``.
        bias_init_vector: optional vector (cast to float32) initializing the
            output-word bias; zeros are used when it is None.
    """
    self.dim_image = dim_image
    self.n_words = n_words
    self.dim_hidden = dim_hidden
    self.batch_size = batch_size
    self.n_caption_steps = n_caption_steps
    self.drop_out_rate = drop_out_rate
    self.n_video_steps = n_video_steps

    # Keep the word-embedding table on the CPU device.
    # NOTE(review): cpu_device is a free name assumed defined at module
    # level — confirm.
    with tf.device(cpu_device):
        self.Wemb = tf.Variable(
            tf.random_uniform([n_words, dim_hidden], -0.1, 0.1), name='Wemb')

    # Decoding LSTM cells: lstm3 decodes the sentence, lstm4 the video.
    self.lstm3 = tf.contrib.rnn.LSTMCell(
        self.dim_hidden, use_peepholes=True, state_is_tuple=True)
    self.lstm4 = tf.contrib.rnn.LSTMCell(
        self.dim_hidden, use_peepholes=True, state_is_tuple=True)

    keep_prob = 1 - self.drop_out_rate
    self.lstm3_dropout = tf.contrib.rnn.DropoutWrapper(
        self.lstm3, output_keep_prob=keep_prob)
    self.lstm4_dropout = tf.contrib.rnn.DropoutWrapper(
        self.lstm4, output_keep_prob=keep_prob)

    self.vae = VAE(self.dim_hidden * 2, self.dim_hidden)

    # Video-feature encoder / decoder projections.
    # NOTE(review): dim_video_feat is a free name assumed defined at module
    # level — confirm.
    self.encode_image_W = tf.Variable(
        tf.random_uniform([dim_video_feat, dim_hidden], -0.026, 0.026),
        name='encode_image_W')
    self.encode_image_b = tf.Variable(
        tf.zeros([dim_hidden]), name='encode_image_b')
    self.decode_image_W = tf.Variable(
        tf.random_uniform([dim_hidden, dim_image], -0.028, 0.028),
        name='decode_image_W')
    self.decode_image_b = tf.Variable(
        tf.zeros([dim_image]), name='decode_image_b')

    # Output-word projection and sentence embedding.
    self.embed_word_W = tf.Variable(
        tf.random_uniform([dim_hidden, n_words], -0.1, 0.1),
        name='embed_word_W')
    self.sent_emb = tf.Variable(
        tf.random_uniform([n_words, dim_hidden], -0.1, 0.1), name='sent_emb')
    if bias_init_vector is not None:
        self.embed_word_b = tf.Variable(
            bias_init_vector.astype(np.float32), name='embed_word_b')
    else:
        self.embed_word_b = tf.Variable(
            tf.zeros([n_words]), name='embed_word_b')
    self.loc_matrix = tf.Variable(
        np.identity(n_words), dtype=tf.float32, name='loc_matrix')
# --- Example 2 ---
    def __init__(self,
                 dim_image,
                 n_words,
                 dim_hidden,
                 batch_size,
                 n_caption_steps,
                 n_video_steps,
                 drop_out_rate,
                 bias_init_vector=None):
        """Build the trainable variables, LSTM cells and attention weights.

        Args:
            dim_image: dimensionality of the input video/image feature.
            n_words: vocabulary size.
            dim_hidden: LSTM hidden-state / embedding size.
            batch_size: mini-batch size.
            n_caption_steps: number of caption (word) time steps.
            n_video_steps: number of video (frame) time steps.
            drop_out_rate: dropout probability; cells keep 1 - drop_out_rate.
            bias_init_vector: optional vector (cast to float32) used to
                initialize the output-word bias; zeros when None.
        """
        self.dim_image = dim_image
        self.n_words = n_words
        self.dim_hidden = dim_hidden
        self.batch_size = batch_size
        self.n_caption_steps = n_caption_steps
        self.drop_out_rate = drop_out_rate
        self.n_video_steps = n_video_steps

        # Word-embedding table pinned to CPU.
        with tf.device("/cpu:0"):
            self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_hidden],
                                                      -0.1, 0.1),
                                    name='Wemb')

        # encoding LSTM for sentence
        self.lstm2 = tf.contrib.rnn.LSTMCell(self.dim_hidden,
                                             use_peepholes=True,
                                             state_is_tuple=True)
        # decoding LSTM for sentence
        self.lstm3 = tf.contrib.rnn.LSTMCell(self.dim_hidden,
                                             use_peepholes=True,
                                             state_is_tuple=True)
        # decoding LSTM for video
        self.lstm4 = tf.contrib.rnn.LSTMCell(self.dim_hidden,
                                             use_peepholes=True,
                                             state_is_tuple=True)

        self.lstm2_dropout = tf.contrib.rnn.DropoutWrapper(
            self.lstm2, output_keep_prob=1 - self.drop_out_rate)
        self.lstm3_dropout = tf.contrib.rnn.DropoutWrapper(
            self.lstm3, output_keep_prob=1 - self.drop_out_rate)
        self.lstm4_dropout = tf.contrib.rnn.DropoutWrapper(
            self.lstm4, output_keep_prob=1 - self.drop_out_rate)

        self.vae = VAE(self.dim_hidden * 2, self.dim_hidden)

        # Video-feature encoder / decoder projections.
        self.encode_image_W = tf.Variable(tf.random_uniform(
            [dim_image, dim_hidden], -0.1, 0.1),
                                          name='encode_image_W')
        self.encode_image_b = tf.Variable(tf.zeros([dim_hidden]),
                                          name='encode_image_b')
        # FIX: name= was previously passed to tf.random_uniform instead of
        # tf.Variable, leaving the variable itself unnamed (breaking
        # checkpoint name matching and the file's naming convention).
        self.decode_image_W = tf.Variable(
            tf.random_uniform([dim_hidden, dim_image], -0.1, 0.1),
            name='decode_image_W')
        # NOTE(review): bias initialized uniform in [0, 1) rather than zeros
        # like the other biases — looks suspicious, but kept as-is; confirm
        # with the author before changing.
        self.decode_image_b = tf.Variable(tf.random_uniform([dim_image]),
                                          name='decode_image_b')

        # Temporal-attention parameters (Bahdanau-style score weights).
        self.embed_att_w = tf.Variable(tf.random_uniform([dim_hidden, 1], -0.1,
                                                         0.1),
                                       name='embed_att_w')
        self.embed_att_Wa = tf.Variable(tf.random_uniform(
            [dim_hidden, dim_hidden], -0.1, 0.1),
                                        name='embed_att_Wa')
        self.embed_att_Ua = tf.Variable(tf.random_uniform(
            [dim_hidden, dim_hidden], -0.1, 0.1),
                                        name='embed_att_Ua')
        self.embed_att_ba = tf.Variable(tf.zeros([dim_hidden]),
                                        name='embed_att_ba')

        # Output-word projection.
        self.embed_word_W = tf.Variable(tf.random_uniform(
            [dim_hidden, n_words], -0.1, 0.1),
                                        name='embed_word_W')
        if bias_init_vector is not None:
            self.embed_word_b = tf.Variable(bias_init_vector.astype(
                np.float32),
                                            name='embed_word_b')
        else:
            self.embed_word_b = tf.Variable(tf.zeros([n_words]),
                                            name='embed_word_b')

        # Projection mixing [context; hidden; embedding] back to dim_hidden.
        self.embed_nn_Wp = tf.Variable(tf.random_uniform(
            [3 * dim_hidden, dim_hidden], -0.1, 0.1),
                                       name='embed_nn_Wp')
        self.embed_nn_bp = tf.Variable(tf.zeros([dim_hidden]),
                                       name='embed_nn_bp')
    def __init__(self,
                 dim_image,
                 n_words,
                 dim_hidden,
                 batch_size,
                 n_caption_steps,
                 n_video_steps,
                 drop_out_rate,
                 bias_init_vector=None):
        """Build the trainable variables and the four encoder/decoder LSTMs.

        Args:
            dim_image: dimensionality of the decoded video/image feature.
            n_words: vocabulary size.
            dim_hidden: LSTM hidden-state / embedding size.
            batch_size: mini-batch size.
            n_caption_steps: number of caption (word) time steps.
            n_video_steps: number of video (frame) time steps.
            drop_out_rate: dropout probability; cells keep 1 - drop_out_rate.
            bias_init_vector: optional vector (cast to float32) used to
                initialize the output-word bias; zeros when None.
        """
        self.dim_image = dim_image
        self.n_words = n_words
        self.dim_hidden = dim_hidden
        self.batch_size = batch_size
        self.n_caption_steps = n_caption_steps
        self.drop_out_rate = drop_out_rate
        self.n_video_steps = n_video_steps

        # Word-embedding table pinned to CPU.
        with tf.device("/cpu:0"):
            self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_hidden],
                                                      -0.1, 0.1),
                                    name='Wemb')

        # encoding LSTM for video
        self.lstm1 = tf.contrib.rnn.LSTMCell(self.dim_hidden,
                                             use_peepholes=True,
                                             state_is_tuple=True)
        # encoding LSTM for sentence
        self.lstm2 = tf.contrib.rnn.LSTMCell(self.dim_hidden,
                                             use_peepholes=True,
                                             state_is_tuple=True)
        # decoding LSTM for sentence
        self.lstm3 = tf.contrib.rnn.LSTMCell(self.dim_hidden,
                                             use_peepholes=True,
                                             state_is_tuple=True)
        # decoding LSTM for video
        self.lstm4 = tf.contrib.rnn.LSTMCell(self.dim_hidden,
                                             use_peepholes=True,
                                             state_is_tuple=True)

        self.lstm1_dropout = tf.contrib.rnn.DropoutWrapper(
            self.lstm1, output_keep_prob=1 - self.drop_out_rate)
        self.lstm2_dropout = tf.contrib.rnn.DropoutWrapper(
            self.lstm2, output_keep_prob=1 - self.drop_out_rate)
        self.lstm3_dropout = tf.contrib.rnn.DropoutWrapper(
            self.lstm3, output_keep_prob=1 - self.drop_out_rate)
        self.lstm4_dropout = tf.contrib.rnn.DropoutWrapper(
            self.lstm4, output_keep_prob=1 - self.drop_out_rate)

        self.vae = VAE(self.dim_hidden * 2, self.dim_hidden)

        # Video-feature encoder / decoder projections.
        # NOTE(review): dim_video_feat is a free name assumed defined at
        # module level — confirm.
        self.encode_image_W = tf.Variable(tf.random_uniform(
            [dim_video_feat, dim_hidden], -0.1, 0.1),
                                          name='encode_image_W')
        self.encode_image_b = tf.Variable(tf.zeros([dim_hidden]),
                                          name='encode_image_b')
        # FIX: name= was previously passed to tf.random_uniform instead of
        # tf.Variable, leaving the variable itself unnamed (breaking
        # checkpoint name matching and the file's naming convention).
        self.decode_image_W = tf.Variable(
            tf.random_uniform([dim_hidden, dim_image], -0.1, 0.1),
            name='decode_image_W')
        # NOTE(review): bias initialized uniform in [0, 1) rather than zeros
        # like the other biases — looks suspicious, but kept as-is; confirm
        # with the author before changing.
        self.decode_image_b = tf.Variable(tf.random_uniform([dim_image]),
                                          name='decode_image_b')

        # Output-word projection.
        self.embed_word_W = tf.Variable(tf.random_uniform(
            [dim_hidden, n_words], -0.1, 0.1),
                                        name='embed_word_W')
        if bias_init_vector is not None:
            self.embed_word_b = tf.Variable(bias_init_vector.astype(
                np.float32),
                                            name='embed_word_b')
        else:
            self.embed_word_b = tf.Variable(tf.zeros([n_words]),
                                            name='embed_word_b')

        # attribute embedding
        # NOTE(review): dim_att is a free name assumed defined at module
        # level — confirm.
        self.embed_att_w = tf.Variable(tf.random_uniform([dim_att, dim_hidden],
                                                         -0.1, 0.1),
                                       name='embed_att_w')
        self.embed_att_b = tf.Variable(tf.zeros([dim_hidden]),
                                       name='embed_att_b')

        # learnable coefficient for normalized video and sentence feature
        self.video_coeff = tf.Variable(tf.ones([1]), name='video_coeff')
        self.sent_coeff = tf.Variable(tf.ones([1]), name='sent_coeff')

        # Hidden-to-hidden / hidden-to-cell bridge between encoder and decoder.
        self.h2h_w = tf.Variable(tf.random_uniform([dim_hidden, dim_hidden],
                                                   -0.1, 0.1),
                                 name='h2h_w')
        self.h2h_b = tf.Variable(tf.zeros([dim_hidden]), name='h2h_b')
        self.h2c_w = tf.Variable(tf.random_uniform([dim_hidden, dim_hidden],
                                                   -0.1, 0.1),
                                 name='h2c_w')
        self.h2c_b = tf.Variable(tf.zeros([dim_hidden]), name='h2c_b')