Example #1
def lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim):
    # Embedding matrix with each row containing the embedding vector of a word.
    # The embedding lookup currently has to be placed on the CPU.
    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        embedding_mat = tf.get_variable("embedding", [num_vocab, embed_dim])
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

    lstm_top = lstm('lstm_lang', embedded_seq, None, output_dim=lstm_dim,
                    num_layers=1, forget_bias=1.0, apply_dropout=False,
                    concat_output=False)[-1]
    return lstm_top
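
A minimal usage sketch, not from the original repository: the `lstm` helper above is project-specific, so this only shows how `lstm_net` would be wired up under TensorFlow 1.x, with toy hyperparameters.

import tensorflow as tf

T, N = 20, 50  # time steps, batch size (illustrative values)
text_seq_batch = tf.placeholder(tf.int32, [T, N])
lstm_top = lstm_net(text_seq_batch, num_vocab=10000, embed_dim=300, lstm_dim=1000)
# lstm_top is the LSTM output at the final time step, shape [N, lstm_dim].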
Example #2
def lstm_encoder(text_seq_batch, name, num_vocab, embed_dim, lstm_dim,
                 apply_dropout, reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        embedding_mat = tf.get_variable("embedding_mat", [num_vocab, embed_dim])
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

        # Take the output at the final timestep of LSTM.
        lstm_top = lstm("lstm_lang", embedded_seq, None, output_dim=lstm_dim,
                        num_layers=1, forget_bias=1.0,
                        apply_dropout=apply_dropout, concat_output=False)[-1]

    return lstm_top
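
What distinguishes this variant is the `reuse` flag, which lets two calls share one set of encoder variables under the same scope name. A hypothetical sketch, with names and sizes invented for illustration:

import tensorflow as tf

query_seq = tf.placeholder(tf.int32, [20, 50])
support_seq = tf.placeholder(tf.int32, [20, 50])
q_enc = lstm_encoder(query_seq, 'text_enc', 10000, 300, 1000,
                     apply_dropout=True)
s_enc = lstm_encoder(support_seq, 'text_enc', 10000, 300, 1000,
                     apply_dropout=False, reuse=True)  # reuses the same weights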
Example #3
def lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim):
    # Embedding matrix with each row containing the embedding vector of a word.
    # The embedding lookup currently has to be placed on the CPU.
    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        embedding_mat = tf.get_variable("embedding", [num_vocab, embed_dim])
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

    lstm_top = lstm('lstm_lang',
                    embedded_seq,
                    None,
                    output_dim=lstm_dim,
                    num_layers=1,
                    forget_bias=1.0,
                    apply_dropout=False,
                    concat_output=False)[-1]
    return lstm_top
Example #4
def lstm_net_glove(text_seq_batch, embedding, lstm_dim):
    # Initialize the embedding layer from a pre-trained matrix and keep it frozen.
    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        embedding_mat = tf.get_variable("embedding",
                                        initializer=embedding,
                                        trainable=False)
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

    lstm_top = lstm('lstm_lang',
                    embedded_seq,
                    None,
                    output_dim=lstm_dim,
                    num_layers=1,
                    forget_bias=1.0,
                    apply_dropout=False,
                    concat_output=False)[-1]
    return lstm_top
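
The `embedding` argument above is a pre-built array of GloVe vectors. A sketch of one way to build it, assuming the standard GloVe text format (a word followed by D floats per line); the path and vocabulary are placeholders:

import numpy as np

def load_glove(glove_path, vocab, embed_dim=300):
    # Rows for words missing from the GloVe file stay zero.
    mat = np.zeros((len(vocab), embed_dim), dtype=np.float32)
    word2row = {w: i for i, w in enumerate(vocab)}
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if parts[0] in word2row:
                mat[word2row[parts[0]]] = np.asarray(parts[1:], dtype=np.float32)
    return mat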
Example #5
	def model_structure(self, sen_data, vis_data, batch_size, is_train, dropout=None):
		if dropout is None:
			dropout = self.dropout

		text_seq_batch = tf.transpose(sen_data, [1, 0])	# transpose input to [num_steps, batch_size]
		with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
			embedding_mat = tf.get_variable("embedding", [self.vocab_size, self.lstm_dim], tf.float32,
				initializer=tf.contrib.layers.xavier_initializer(uniform=True))
			# text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
			embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)
		# Encode the phrase from the hidden state at the last time step.
		_, states = lstm('lstm_lang', embedded_seq, None, output_dim=self.lstm_dim,
						num_layers=1, forget_bias=1.0, apply_dropout=False, concat_output=False,
						initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))

		# Batch normalization for the visual and language branches.
		sen_raw = states[-1].h
		vis_raw = tf.reshape(vis_data, [self.batch_size*self.num_prop, self.img_feat_size])

		sen_bn = bn(sen_raw, is_train, "SEN_BN", 0.9)
		vis_bn = bn(vis_raw, is_train, "VIS_BN", 0.9)

		sen_output = tf.reshape(sen_bn, [self.batch_size, 1, 1, self.lstm_dim])
		vis_output = tf.reshape(vis_bn, [self.batch_size, self.num_prop, 1, self.img_feat_size])

		sen_tile = tf.tile(sen_output, [1, self.num_prop, 1, 1])
		feat_concat = tf.concat([sen_tile, vis_output], 3)

		feat_proj_init = msr_init([1, 1, self.lstm_dim+self.img_feat_size, self.hidden_size])
		feat_proj = conv("feat_proj", feat_concat, 1, 1, self.hidden_size, weights_initializer=feat_proj_init)
		feat_relu = tf.nn.relu(feat_proj)

		att_conv_init = msr_init([1, 1, self.hidden_size, 1])
		att_conv = conv("att_conv", feat_relu, 1, 1, 1, weights_initializer=att_conv_init)
		att_scores = tf.reshape(att_conv, [self.batch_size, self.num_prop])

		return att_scores
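
A shape sketch of the fusion step above, using plain TF ops and toy sizes: tiling the sentence vector across proposals and concatenating pairs every proposal's visual feature with the same phrase encoding, so the 1x1 convolution acts as a per-proposal scoring MLP.

import tensorflow as tf

batch_size, num_prop, lstm_dim, img_feat_size = 2, 5, 8, 6
sen = tf.zeros([batch_size, 1, 1, lstm_dim])
vis = tf.zeros([batch_size, num_prop, 1, img_feat_size])
sen_tile = tf.tile(sen, [1, num_prop, 1, 1])  # [2, 5, 1, 8]
feat_concat = tf.concat([sen_tile, vis], 3)   # [2, 5, 1, 14]
print(feat_concat.get_shape())                # (2, 5, 1, 14)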
Example #6
    def model_structure(self,
                        sen_data,
                        enc_data,
                        dec_data,
                        msk_data,
                        vis_data,
                        batch_size,
                        is_train,
                        dropout=None):
        def set_drop_test():
            return tf.cast(1.0, tf.float32)

        def set_drop_train():
            return tf.cast(self.dropout, tf.float32)

        dropout = tf.cond(is_train, set_drop_train, set_drop_test)

        seq_length = tf.reduce_sum(msk_data, 1)
        text_seq_batch = sen_data

        with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
            embedding_mat = tf.get_variable(
                "embedding", [self.vocab_size, self.lstm_dim],
                tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(uniform=True))
            # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
            embedded_seq = tf.nn.embedding_lookup(embedding_mat,
                                                  text_seq_batch)

        # Encode the phrase from the hidden state at the last time step.
        outputs, states = lstm('enc_lstm',
                               embedded_seq,
                               None,
                               seq_length,
                               output_dim=self.lstm_dim,
                               num_layers=1,
                               forget_bias=1.0,
                               apply_dropout=True,
                               keep_prob=dropout,
                               concat_output=False,
                               initializer=tf.random_uniform_initializer(
                                   minval=-0.08, maxval=0.08))

        sen_raw = states[-1].h
        sen_raw = tf.nn.l2_normalize(sen_raw, dim=1)

        vis_raw = tf.reshape(
            vis_data, [self.batch_size * self.num_prop, self.img_feat_size])

        sen_output = tf.reshape(sen_raw,
                                [self.batch_size, 1, 1, self.lstm_dim])
        vis_output = tf.reshape(
            vis_raw, [self.batch_size, self.num_prop, 1, self.img_feat_size])

        sen_tile = tf.tile(sen_output, [1, self.num_prop, 1, 1])
        feat_concat = tf.concat([sen_tile, vis_output], 3)

        feat_proj_init = msr_init(
            [1, 1, self.lstm_dim + self.img_feat_size, self.hidden_size])
        feat_proj = conv("feat_proj",
                         feat_concat,
                         1,
                         1,
                         self.hidden_size,
                         weights_initializer=feat_proj_init)
        feat_relu = tf.nn.relu(feat_proj)

        att_conv_init = msr_init([1, 1, self.hidden_size, 1])
        att_conv = conv("att_conv",
                        feat_relu,
                        1,
                        1,
                        1,
                        weights_initializer=att_conv_init)

        # Generate the visual attention feature.
        att_scores_t = tf.reshape(att_conv, [self.batch_size, self.num_prop])
        # att_prob = tf.nn.softmax(att_scores_t)
        att_prob = tf.nn.relu(att_scores_t)

        att_scores = tf.reshape(att_prob, [self.batch_size, self.num_prop, 1])

        vis_att_feat = tf.reduce_sum(
            tf.multiply(vis_data,
                        tf.tile(att_scores, [1, 1, self.img_feat_size])), 1)
        vis_att_featFC = fc_relu(
            "vis_enc",
            vis_att_feat,
            self.lstm_dim,
            weights_initializer=tf.random_uniform_initializer(minval=-0.002,
                                                              maxval=0.002))

        vis_att_tile = tf.reshape(vis_att_featFC,
                                  [self.batch_size, 1, self.lstm_dim])

        text_enc_batch = enc_data
        # embedded_enc: batch_size x phrase_len x lstm_dim
        with tf.variable_scope('enc_embedding'), tf.device("/cpu:0"):
            embedding_enc = tf.get_variable(
                "embedding", [self.vocab_size, self.lstm_dim],
                tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(uniform=True))
            # text_enc_batch has shape [N, T] and embedded_enc has shape [N, T, D].
            embedded_enc = tf.nn.embedding_lookup(embedding_enc,
                                                  text_enc_batch)

        # dec_vis_embed = batch_size x phrase_len x (2*lstm_dim)
        dec_vis_embed = tf.concat([
            embedded_enc,
            tf.concat([
                vis_att_tile,
                tf.zeros((self.batch_size, self.phrase_len - 1, self.lstm_dim))
            ], 1)
        ], 2)
        # dec_outs: batch_size x phrase_len x lstm_dim
        dec_outs, _ = lstm('dec_lstm',
                           dec_vis_embed,
                           None,
                           seq_length,
                           output_dim=self.lstm_dim,
                           num_layers=1,
                           forget_bias=1.0,
                           apply_dropout=True,
                           keep_prob=dropout,
                           concat_output=True,
                           initializer=tf.random_uniform_initializer(
                               minval=-0.08, maxval=0.08))

        dec_outs = tf.reshape(
            dec_outs, [self.batch_size * self.phrase_len, self.lstm_dim])
        # dec_logits: (batch_size*phrase_len) x vocab_size
        dec_logits = fc(
            'dec_logits',
            dec_outs,
            self.vocab_size,
            weights_initializer=tf.contrib.layers.xavier_initializer(
                uniform=True))

        return att_scores_t, dec_logits, vis_data
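
One plausible training loss for the returned decoder logits, an assumption rather than part of the source: masked softmax cross-entropy against the target word indices, with msk_data zeroing out padded steps.

import tensorflow as tf

def masked_xent(dec_logits, dec_data, msk_data, batch_size, phrase_len):
    # dec_logits: (batch_size*phrase_len) x vocab_size, as produced above.
    labels = tf.reshape(dec_data, [batch_size * phrase_len])
    mask = tf.reshape(tf.cast(msk_data, tf.float32), [batch_size * phrase_len])
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                          logits=dec_logits)
    return tf.reduce_sum(xent * mask) / tf.maximum(tf.reduce_sum(mask), 1.0)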