Example #1
def rpn_net(conv5,
            im_info,
            name,
            feat_stride=16,
            anchor_scales=(8, 16, 32),
            phase='TEST'):
    with tf.variable_scope(name):
        # rpn_conv/3x3
        rpn_conv = conv_relu('rpn_conv/3x3',
                             conv5,
                             kernel_size=3,
                             stride=1,
                             output_dim=512)
        # rpn_cls_score
        # Note that the bg weights have already been subtracted from the fg
        # weights, so we apply sigmoid instead of softmax (sigmoid is actually
        # not needed for ranking).
        rpn_cls_score = conv('rpn_cls_score',
                             rpn_conv,
                             kernel_size=1,
                             stride=1,
                             output_dim=len(anchor_scales) * 3)
        # rpn_bbox_pred
        rpn_bbox_pred = conv('rpn_bbox_pred',
                             rpn_conv,
                             kernel_size=1,
                             stride=1,
                             output_dim=len(anchor_scales) * 3 * 4)

        rois = tf.py_func(ProposalLayer(feat_stride, anchor_scales, phase),
                          [rpn_cls_score, rpn_bbox_pred, im_info],
                          [tf.float32],
                          stateful=False)[0]
        rois.set_shape([None, 5])
        return rois
Example #2
def build_output_unit_loc(q_encoding, kb_batch, att_last,
                          scope='output_unit_loc', reuse=None):
    """
    Apply a one-layer convolutional network to predict localization scores.
    Apply dropout if specified.

    Input:
        kb_batch: [N, H, W, d], tf.float32
        att_last: [N, H, W, 1], tf.float32
    Return:
        loc_scores: [N, H*W], tf.float32
        bbox_offset: [N, 4], tf.float32
    """

    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.LOC_SCORES_POS_AFFINE:
            # make sure att signs do not flip
            w = tf.abs(tf.get_variable('loc_scores_affine_raw_w', []))
            b = tf.get_variable('loc_scores_affine_b', [])
            loc_scores = w * att_last + b
        else:
            loc_scores = conv(
                'conv_loc', att_last, kernel_size=3, stride=1, output_dim=1)
        loc_scores = tf.reshape(
            loc_scores, [-1, cfg.MODEL.H_FEAT*cfg.MODEL.W_FEAT])
        # extract the attended features for bounding box regression
        if cfg.MODEL.BBOX_REG_AS_FCN:
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc(
                    'fc_q_mapped', q_encoding, output_dim=cfg.MODEL.KB_DIM)
                bbox_offset_input = tf.nn.l2_normalize(
                    q_mapped[:, ax, ax, :] * kb_batch, axis=-1)
            else:
                bbox_offset_input = kb_batch
            bbox_offset_fcn = conv(
                'conv_bbox_offset', bbox_offset_input, 1, 1, output_dim=4)
            N = tf.shape(bbox_offset_fcn)[0]
            B = cfg.MODEL.H_FEAT*cfg.MODEL.W_FEAT  # B = H*W
            # bbox_offset_fcn [N, B, 4] is used for training
            bbox_offset_fcn = tf.reshape(bbox_offset_fcn, to_T([N, B, 4]))
            # bbox_offset [N, 4] is only used for prediction
            bbox_offset_flat = tf.reshape(bbox_offset_fcn, to_T([N*B, 4]))
            slice_inds = tf.range(N) * B + tf.argmax(
                loc_scores, axis=-1, output_type=tf.int32)
            bbox_offset = tf.gather(bbox_offset_flat, slice_inds)
        else:
            bbox_offset_fcn = None
            kb_loc = _extract_softmax_avg(kb_batch, att_last)
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc(
                    'fc_q_mapped', q_encoding, output_dim=cfg.MODEL.KB_DIM)
                elt_prod = tf.nn.l2_normalize(q_mapped * kb_loc, axis=-1)
                bbox_offset = fc(
                    'fc_bbox_offset_with_q', elt_prod, output_dim=4)
            else:
                bbox_offset = fc('fc_bbox_offset', kb_loc, output_dim=4)
    return loc_scores, bbox_offset, bbox_offset_fcn
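A minimal call sketch for the snippet above (not from the source repository); it assumes the project's cfg, conv, fc and to_T helpers are in scope, and the 512-d question encoding is an illustrative shape only:
import tensorflow as tf

# hypothetical placeholders matching the docstring shapes
q_encoding = tf.placeholder(tf.float32, [None, 512])
kb_batch = tf.placeholder(
    tf.float32, [None, cfg.MODEL.H_FEAT, cfg.MODEL.W_FEAT, cfg.MODEL.KB_DIM])
att_last = tf.placeholder(
    tf.float32, [None, cfg.MODEL.H_FEAT, cfg.MODEL.W_FEAT, 1])
loc_scores, bbox_offset, bbox_offset_fcn = build_output_unit_loc(
    q_encoding, kb_batch, att_last)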
Example #3
def build_kb_batch(image_feat_batch, scope='kb_batch', reuse=None):
    """
    Concatenate the image batch and the position-encoding batch, and apply a
    2-layer CNN on top of the result.

    Input:
        image_feat_batch: [N, H, W, C], tf.float32
    Return:
        kb_batch: [N, H, W, d], tf.float32
    """

    kb_dim = cfg.MODEL.KB_DIM
    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.INPUT.USE_L2_NORMALIZATION:
            norm_type = cfg.MODEL.INPUT.L2_NORMALIZATION_TYPE
            if norm_type == 'global':
                # Normalize along H, W, C
                image_feat_batch = tf.nn.l2_normalize(image_feat_batch,
                                                      axis=[1, 2, 3])
            elif norm_type == 'local':
                # Normalize along C
                image_feat_batch = tf.nn.l2_normalize(image_feat_batch,
                                                      axis=-1)
            else:
                raise ValueError('Invalid l2 normalization type: ' + norm_type)

        if cfg.MODEL.INPUT.USE_POSITION_ENCODING:
            # get positional encoding
            N = tf.shape(image_feat_batch)[0]

            _, H, W, _ = image_feat_batch.get_shape().as_list()
            position_encoding = to_T(get_positional_encoding(H, W),
                                     dtype=tf.float32)
            position_batch = tf.tile(position_encoding, to_T([N, 1, 1, 1]))

            # apply a two layer convnet with ELU activation
            conv1 = conv_elu('conv1',
                             tf.concat([image_feat_batch, position_batch],
                                       axis=3),
                             kernel_size=1,
                             stride=1,
                             output_dim=kb_dim)
            conv2 = conv('conv2',
                         conv1,
                         kernel_size=1,
                         stride=1,
                         output_dim=kb_dim)

            kb_batch = conv2
        else:
            kb_batch = conv('conv_no_pe',
                            image_feat_batch,
                            kernel_size=1,
                            stride=1,
                            output_dim=kb_dim)
    return kb_batch
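A usage sketch, assuming the same cfg and conv helpers as above are in scope; the 14x14x2048 feature-grid shape is illustrative only:
import tensorflow as tf

image_feat_batch = tf.placeholder(tf.float32, [None, 14, 14, 2048])
kb_batch = build_kb_batch(image_feat_batch)  # -> [N, 14, 14, cfg.MODEL.KB_DIM]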
Example #4
	def model_structure(self, sen_data, vis_data, batch_size, is_train, dropout=None):
		if dropout is None:
			dropout = self.dropout

		text_seq_batch = tf.transpose(sen_data, [1, 0])	# input data is [num_steps, batch_size]
		with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
			if self.embed_w is None:
				initializer = tf.contrib.layers.xavier_initializer(uniform=True)
			else:
				initializer = tf.constant_initializer(self.embed_w)
			embedding_mat = tf.get_variable("embedding", [self.vocab_size, self.lstm_dim], tf.float32,
				initializer=initializer)
			# text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
			embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

		# encode the phrase based on the last step of the hidden states
		outputs, _, _ = bi_lstm('lstm_lang', embedded_seq, None, output_dim=self.lstm_dim,
						num_layers=1, forget_bias=1.0, apply_dropout=False,concat_output=False,
						initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))

		sen_raw = outputs[-1]
		vis_raw = tf.reshape(vis_data, [self.batch_size*self.num_prop, self.img_feat_size])

		sen_bn = bn(sen_raw, is_train, "SEN_BN", 0.9)
		vis_bn = bn(vis_raw, is_train, "VIS_BN", 0.9)

		sen_output = tf.reshape(sen_bn, [self.batch_size, 1, 1, 2*self.lstm_dim])	# bi-directional lstm: hidden_size double
		vis_output = tf.reshape(vis_bn, [self.batch_size, self.num_prop, 1, self.img_feat_size])

		sen_tile = tf.tile(sen_output, [1, self.num_prop, 1, 1])
		feat_concat = tf.concat([sen_tile, vis_output], 3)

		feat_proj_init = msr_init([1, 1, 2*self.lstm_dim+self.img_feat_size, self.hidden_size])
		feat_proj = conv("feat_proj", feat_concat, 1, 1, self.hidden_size, weights_initializer=feat_proj_init)
		feat_relu = tf.nn.relu(feat_proj)

		att_conv_init = msr_init([1, 1, self.hidden_size, 5])
		att_conv = conv("att_conv", feat_relu, 1, 1, 5, weights_initializer=att_conv_init)
		att_scores = tf.reshape(att_conv, [self.batch_size, self.num_prop, 5])

		att_logits = tf.reshape(att_scores[:, :, 0], [self.batch_size, self.num_prop])
		_, pred_ind = tf.nn.top_k(att_logits, self.top_k)
		pred_ind = tf.reshape(pred_ind, [self.batch_size*self.top_k, 1])
		row_ind = tf.reshape(tf.range(0, self.batch_size), [-1, 1])
		row_ind = tf.reshape(tf.tile(row_ind, [1, self.top_k]), [self.top_k*self.batch_size, 1])
		pred_ind = tf.concat([row_ind, pred_ind], 1)
		
		# (batch_size*top_k) x img_feat_size
		vis_top = tf.gather_nd(tf.reshape(vis_output, [self.batch_size, self.num_prop, self.img_feat_size]), pred_ind)
		vis_ref = tf.reduce_mean(tf.reshape(vis_top, [self.batch_size, self.top_k, self.img_feat_size]), 1)
		ref_feat = tf.concat([vis_ref, sen_bn], 1)
		# ref_feat = vis_ref
		reward_pred = tf.reshape(tf.sigmoid(fc('reward_pred', ref_feat, 1)),[self.batch_size])

		return att_scores, reward_pred
Example #5
def text_objseg_full_conv(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
    lstm_dim, mlp_hidden_dims, vgg_dropout, mlp_dropout):

    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)

    # Local image feature
    feat_vis = vgg_net.vgg_fc8_full_conv(imcrop_batch, 'vgg_local',
        apply_dropout=vgg_dropout)

    # Reshape and tile LSTM top
    featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
    N, D_text = feat_lang.get_shape().as_list()
    feat_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
        [1, featmap_H, featmap_W, 1])

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them along axis 3 (channel dimension)
    spatial_batch = tf.convert_to_tensor(generate_spatial_batch(N, featmap_H, featmap_W))
    feat_all = tf.concat([tf.nn.l2_normalize(feat_lang, 3),
                          tf.nn.l2_normalize(feat_vis, 3),
                          spatial_batch], 3)

    # MLP classifier over the concatenated features
    with tf.variable_scope('classifier'):
        mlp_l1 = conv_relu('mlp_l1', feat_all, kernel_size=1, stride=1,
            output_dim=mlp_hidden_dims)
        if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = conv('mlp_l2', mlp_l1, kernel_size=1, stride=1, output_dim=1)

    return mlp_l2
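A call sketch with hypothetical hyper-parameters (the vocabulary size, dimensions and 512x512 input size below are placeholders, not values from the source repository); it assumes the lstm_net, vgg_net and generate_spatial_batch helpers used above are importable:
import tensorflow as tf

T, N = 20, 1  # sequence length and batch size (a static N is needed by generate_spatial_batch)
text_seq_batch = tf.placeholder(tf.int32, [T, N])
imcrop_batch = tf.placeholder(tf.float32, [N, 512, 512, 3])
scores = text_objseg_full_conv(text_seq_batch, imcrop_batch,
                               num_vocab=1000, embed_dim=300, lstm_dim=500,
                               mlp_hidden_dims=500, vgg_dropout=False,
                               mlp_dropout=False)  # coarse per-cell segmentation scores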
Example #6
def build_kb_batch(image_feat_batch, scope='kb_batch', reuse=None):
    """
    Concatenate the image batch and the position-encoding batch, and apply a
    2-layer CNN on top of the result.

    Input:
        image_feat_batch: [N, H, W, C], tf.float32
    Return:
        kb_batch: [N, H, W, d], tf.float32
    """

    kb_dim = cfg.MODEL.KB_DIM
    with tf.variable_scope(scope, reuse=reuse):
        # get positional encoding
        N = tf.shape(image_feat_batch)[0]
        _, H, W, _ = image_feat_batch.get_shape().as_list()
        position_encoding = to_T(
            get_positional_encoding(H, W), dtype=tf.float32)
        position_batch = tf.tile(position_encoding, to_T([N, 1, 1, 1]))

        # apply a two layer convnet with ELU activation
        conv1 = conv_elu(
            'conv1', tf.concat([image_feat_batch, position_batch], axis=3),
            kernel_size=1, stride=1, output_dim=kb_dim)
        conv2 = conv(
            'conv2', conv1, kernel_size=1, stride=1, output_dim=kb_dim)

        kb_batch = conv2
    return kb_batch
Example #7
def _1x1conv(name, bottom, output_dim, reuse=None):
    return conv(name,
                bottom,
                kernel_size=1,
                stride=1,
                output_dim=output_dim,
                reuse=reuse)
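The wrapper simply pins kernel_size and stride to 1; a brief usage sketch with a hypothetical NHWC feature tensor:
import tensorflow as tf

bottom = tf.placeholder(tf.float32, [None, 7, 7, 2048])
projected = _1x1conv('project_feat', bottom, output_dim=256)  # -> [N, 7, 7, 256]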
Example #8
    def forward(self, imcrop_batch, text_seq_batch, is_training=True):
        num_vocab, embed_dim, lstm_dim, mlp_hidden_dims = self.num_vocab, self.embed_dim, self.lstm_dim, self.mlp_hidden_dims
        deeplab_dropout = self.kwargs.get('deeplab_dropout', False)
        mlp_dropout = self.kwargs.get('mlp_dropout', False)

        with tf.variable_scope(self.model_name):
            # Language feature (LSTM hidden state)
            feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                          lstm_dim)[0]

            # Local image feature
            feat_vis = deeplab.deeplab_fc8_full_conv(
                imcrop_batch, 'deeplab', apply_dropout=deeplab_dropout)

            # Reshape and tile LSTM top
            featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
            N, D_text = feat_lang.get_shape().as_list()
            feat_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                                [1, featmap_H, featmap_W, 1])

            # L2-normalize the features (except for spatial_batch)
            # and concatenate them along axis 3 (channel dimension)
            spatial_batch = tf.convert_to_tensor(
                generate_spatial_batch(N, featmap_H, featmap_W))
            feat_all = tf.concat(axis=3,
                                 values=[
                                     tf.nn.l2_normalize(feat_lang, 3),
                                     tf.nn.l2_normalize(feat_vis, 3),
                                     spatial_batch
                                 ])

            # MLP classifier over the concatenated features
            with tf.variable_scope('classifier'):
                mlp_l1 = conv_relu('mlp_l1',
                                   feat_all,
                                   kernel_size=1,
                                   stride=1,
                                   output_dim=mlp_hidden_dims)
                if mlp_dropout:
                    mlp_l1 = drop(mlp_l1, 0.5)
                mlp_l2 = conv('mlp_l2',
                              mlp_l1,
                              kernel_size=1,
                              stride=1,
                              output_dim=1)

                upsample8s = deconv('upsample8s',
                                    mlp_l2,
                                    kernel_size=16,
                                    stride=8,
                                    output_dim=1,
                                    bias_term=False)

        return upsample8s
Example #9
def vgg_fc8_full_conv(input_batch,
                      name,
                      apply_dropout,
                      output_dim=1000,
                      reuse=None):
    fc7 = vgg_fc7_full_conv(input_batch, name, apply_dropout, reuse)
    with tf.variable_scope(name, reuse=reuse):
        # layer 8 (no ReLU after fc8)
        fc8 = conv('fc8', fc7, kernel_size=1, stride=1, output_dim=output_dim)
        return fc8
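A usage sketch (the 512x512 input size is illustrative; the spatial size of fc8 depends on the vgg_fc7_full_conv helper):
import tensorflow as tf

image_batch = tf.placeholder(tf.float32, [None, 512, 512, 3])
fc8 = vgg_fc8_full_conv(image_batch, 'vgg_local', apply_dropout=False)
# fc8 is a coarse spatial map of 1000-way scores (one score vector per feature-map cell)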
Example #10
def conv_net_bn(input_batch, name, phase):
    with tf.variable_scope(name):
        # conv1: 2*2@4/2 (2x2 kernel, 4 output channels, stride 2)
        conv1 = conv_relu_bn('conv1',
                             input_batch,
                             phase,
                             kernel_size=2,
                             stride=2,
                             output_dim=4)
        print("conv1: ", conv1)
        #conv2: 2*2@4/1
        conv2 = conv_relu_bn('conv2',
                             conv1,
                             phase,
                             kernel_size=2,
                             stride=1,
                             output_dim=4)
        print("conv2: ", conv2)
        #conv3: 2*2@8/2
        conv3 = conv_relu_bn('conv3',
                             conv2,
                             phase,
                             kernel_size=2,
                             stride=2,
                             output_dim=8)
        print("conv3: ", conv3)
        #conv4: 2*2@8/1
        conv4 = conv_relu_bn('conv4',
                             conv3,
                             phase,
                             kernel_size=2,
                             stride=1,
                             output_dim=8)
        print("conv4: ", conv4)
        #conv5: 2*2@8/2
        conv5 = conv_relu_bn('conv5',
                             conv4,
                             phase,
                             kernel_size=2,
                             stride=2,
                             output_dim=8)
        print("conv5: ", conv5)
        #conv6: 2*2@8/1 tanh
        conv6 = conv('conv6', conv5, kernel_size=2, stride=1, output_dim=8)
        conv6 = tf.contrib.layers.batch_norm(conv6,
                                             center=True,
                                             scale=True,
                                             is_training=phase,
                                             scope='bn')
        print("conv6: ", conv6)
        tanh = tf.nn.tanh(conv6)

        return tanh
Example #11
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False):
    with tf.variable_scope(name):
        if reuse:
            print(name + " reuses variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")

        layer1 = conv_relu('layer1', input_batch,
                           kernel_size=1, stride=1, output_dim=middle_layer_dim)
        sim_score = conv('layer2', layer1,
                         kernel_size=1, stride=1, output_dim=3)
    return sim_score
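A call sketch, assuming the conv/conv_relu helpers are in scope; the input is a hypothetical NHWC feature tensor (the 1x1 convolutions only care about the channel dimension):
import tensorflow as tf

clip_feat = tf.placeholder(tf.float32, [None, 1, 1, 4096])
sim_score = vs_multilayer(clip_feat, 'vs_head')  # -> [N, 1, 1, 3]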
Example #12
	def model_structure(self, sen_data, vis_data, batch_size, is_train, dropout=None):
		if dropout is None:
			dropout = self.dropout

		text_seq_batch = tf.transpose(sen_data, [1, 0])	# input data is [num_steps, batch_size]
		with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
			embedding_mat = tf.get_variable("embedding", [self.vocab_size, self.lstm_dim], tf.float32,
				initializer=tf.contrib.layers.xavier_initializer(uniform=True))
			# text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
			embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)
		# encode the phrase based on the last step of the hidden states
		_, states = lstm('lstm_lang', embedded_seq, None, output_dim=self.lstm_dim,
						num_layers=1, forget_bias=1.0, apply_dropout=False,concat_output=False,
						initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))

		# batch normalization for visual and language part
		sen_raw = states[-1].h
		vis_raw = tf.reshape(vis_data, [self.batch_size*self.num_prop, self.img_feat_size])

		sen_bn = bn(sen_raw, is_train, "SEN_BN", 0.9)
		vis_bn = bn(vis_raw, is_train, "VIS_BN", 0.9)

		sen_output = tf.reshape(sen_bn, [self.batch_size, 1, 1, self.lstm_dim])
		vis_output = tf.reshape(vis_bn, [self.batch_size, self.num_prop, 1, self.img_feat_size])

		sen_tile = tf.tile(sen_output, [1, self.num_prop, 1, 1])
		feat_concat = tf.concat([sen_tile, vis_output], 3)

		feat_proj_init = msr_init([1, 1, self.lstm_dim+self.img_feat_size, self.hidden_size])
		feat_proj = conv("feat_proj", feat_concat, 1, 1, self.hidden_size, weights_initializer=feat_proj_init)
		feat_relu = tf.nn.relu(feat_proj)

		att_conv_init = msr_init([1, 1, self.hidden_size, 1])
		att_conv = conv("att_conv", feat_relu, 1, 1, 1, weights_initializer=att_conv_init)
		att_scores = tf.reshape(att_conv, [self.batch_size, self.num_prop])

		return att_scores
Example #13
    def loc_init(self, images, scope='kb_batch', reuse=None):
        """
        Linearly transform the input features to a fixed dimension (cfg.LOC_DIM).
        """
        with tf.variable_scope(scope, reuse=reuse):
            if cfg.STEM_NORMALIZE:
                images = tf.nn.l2_normalize(images, axis=-1)

            # apply a single layer convnet
            conv1 = conv('conv1',
                         images,
                         kernel_size=1,
                         stride=1,
                         output_dim=cfg.LOC_DIM)
        return conv1
Example #14
    def __init__(self,
                 images,
                 q_encoding,
                 image_valid_batch,
                 num_choices,
                 scope='single_hop',
                 reuse=None):

        x_loc = self.loc_init(images, reuse=reuse)

        with tf.variable_scope(scope, reuse=reuse):
            x_loc_shape = tf.shape(x_loc)
            B, H, W = x_loc_shape[0], x_loc_shape[1], x_loc_shape[2]
            dim = x_loc.get_shape().as_list()[-1]  # static shape

            # attention over x_loc
            proj_q = fc('fc_q_map1', q_encoding, output_dim=dim)[:, ax, ax, :]
            interactions = tf.nn.l2_normalize(x_loc * proj_q, axis=-1)
            raw_att = conv('conv_att_score',
                           interactions,
                           kernel_size=1,
                           stride=1,
                           output_dim=1)
            raw_att = tf.reshape(raw_att, to_T([B, H * W]))  # (N, H*W)
            valid_mask = tf.reshape(image_valid_batch, tf.shape(raw_att))
            raw_att = raw_att * valid_mask - 1e18 * (1 - valid_mask)
            att = tf.nn.softmax(raw_att, axis=-1)  # (N, H*W)

            # collect attended image feature
            x_att = tf.matmul(tf.reshape(att, to_T([B, 1, H * W])),
                              tf.reshape(x_loc, to_T([B, H * W,
                                                      dim])))  # (N, 1, D_kb)
            x_att = tf.reshape(x_att, to_T([B, dim]))  # (N, D_kb)

            # VQA classification
            eQ = fc('fc_q_map2', q_encoding, output_dim=dim)
            if cfg.OUT_QUESTION_MUL:
                features = tf.concat([x_att, eQ, x_att * eQ], axis=-1)
            else:
                features = tf.concat([x_att, eQ], axis=-1)

            fc1 = fc_relu('fc_hidden',
                          features,
                          output_dim=cfg.OUT_CLASSIFIER_DIM)
            logits = fc('fc_scores', fc1, output_dim=num_choices)
            self.logits = logits
Example #15
def text_objseg_full_conv(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
                          lstm_dim, mlp_hidden_dims, deeplab_dropout,
                          mlp_dropout, is_training):

    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)[0]

    # DeepLab ResNet-101 backbone
    net = deeplab101.DeepLabResNetModel({'data': imcrop_batch},
                                        is_training=is_training)
    feat_vis = net.layers['fc1_voc12']

    # # Local image feature
    # feat_vis = deeplab.deeplab_fc8_full_conv(imcrop_batch, 'deeplab',
    #     apply_dropout=deeplab_dropout)

    # Reshape and tile LSTM top
    featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
    N, D_text = feat_lang.get_shape().as_list()
    feat_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                        [1, featmap_H, featmap_W, 1])

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them along axis 3 (channel dimension)
    spatial_batch = tf.convert_to_tensor(
        generate_spatial_batch(N, featmap_H, featmap_W))
    feat_all = tf.concat(axis=3,
                         values=[
                             tf.nn.l2_normalize(feat_lang, 3),
                             tf.nn.l2_normalize(feat_vis, 3), spatial_batch
                         ])

    # MLP classifier over the concatenated features
    with tf.variable_scope('classifier'):
        mlp_l1 = conv_relu('mlp_l1',
                           feat_all,
                           kernel_size=1,
                           stride=1,
                           output_dim=mlp_hidden_dims)
        if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = conv('mlp_l2', mlp_l1, kernel_size=1, stride=1, output_dim=1)

    return mlp_l2
Example #16
def conv_net(input_batch, name):
    with tf.variable_scope(name):
        # conv1: 2*2@4/2 (2x2 kernel, 4 output channels, stride 2)
        conv1 = conv_relu('conv1',
                          input_batch,
                          kernel_size=2,
                          stride=2,
                          output_dim=4)
        print("conv1: ", conv1)
        #conv2: 2*2@4/1
        conv2 = conv_relu('conv2',
                          conv1,
                          kernel_size=2,
                          stride=1,
                          output_dim=4)
        print("conv2: ", conv2)
        #conv3: 2*2@8/2
        conv3 = conv_relu('conv3',
                          conv2,
                          kernel_size=2,
                          stride=2,
                          output_dim=8)
        print("conv3: ", conv3)
        #conv4: 2*2@8/1
        conv4 = conv_relu('conv4',
                          conv3,
                          kernel_size=2,
                          stride=1,
                          output_dim=8)
        print("conv4: ", conv4)
        #conv5: 2*2@8/2
        conv5 = conv_relu('conv5',
                          conv4,
                          kernel_size=2,
                          stride=2,
                          output_dim=8)
        print("conv5: ", conv5)
        #conv6: 2*2@8/1 tanh
        conv6 = conv('conv6', conv5, kernel_size=2, stride=1, output_dim=8)
        print("conv6: ", conv6)
        tanh = tf.nn.tanh(conv6)

        return tanh
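A usage sketch with a hypothetical 16x16 single-channel input; the spatial size is halved at conv1, conv3 and conv5 (the stride-2 layers):
import tensorflow as tf

input_batch = tf.placeholder(tf.float32, [None, 16, 16, 1])
encoded = conv_net(input_batch, 'conv_net')  # -> [N, 2, 2, 8] after three stride-2 convs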
Example #17
def _conv(name,
          bottom,
          kernel_size,
          stride,
          output_dim,
          padding='SAME',
          bias_term=True,
          weights_initializer=None,
          biases_initializer=None,
          reuse=None):
    g = tf.get_default_graph()
    with g.gradient_override_map({'Conv2D': 'Conv2D_handle_empty_batch'}):
        return conv(name,
                    bottom,
                    kernel_size,
                    stride,
                    output_dim,
                    padding,
                    bias_term,
                    weights_initializer,
                    biases_initializer,
                    reuse=reuse)
Example #18
def recurrent_multimodal(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
                         lstm_dim, mlp_hidden_dims, feature_vis_dropout,
                         mlp_dropout):

    _, feat_langs, embedded_seq = lstm_net.lstm_net(text_seq_batch, num_vocab,
                                                    embed_dim, lstm_dim)

    # feat_vis = vgg_net.vgg_fc8_full_conv(imcrop_batch, 'vgg_local', apply_dropout=vgg_dropout)
    feat_vis = deeplab.deeplab_fc8_full_conv(imcrop_batch,
                                             'deeplab',
                                             feature_vis_dropout,
                                             output_dim=1000)

    featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]

    # Reshape and tile feat_langs, embedded_seq
    T, N, D_text = embedded_seq.get_shape().as_list()
    feat_langs = [
        tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                [1, featmap_H, featmap_W, 1]) for feat_lang in feat_langs
    ]

    embedded_seq = [
        tf.tile(tf.reshape(_embedded_seq, (N, 1, 1, embed_dim)),
                [1, featmap_H, featmap_W, 1])
        for _embedded_seq in tf.split(embedded_seq, T, 0)
    ]

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them along axis 3 (channel dimension)
    spatial_batch = tf.convert_to_tensor(
        generate_spatial_batch(N, featmap_H, featmap_W))

    # concatenate all features for each time step
    feat_alls = []
    for i in range(T):
        feat_alls.append(
            tf.concat([
                tf.nn.l2_normalize(feat_langs[i], 3),
                tf.nn.l2_normalize(feat_vis, 3), spatial_batch
            ], 3))
        #feat_alls.append(tf.concat([feat_langs[i], feat_vis, spatial_batch], 3))

    feat_all = tf.stack(feat_alls, 3)
    feat_all = tf.transpose(feat_all, [0, 3, 1, 2, 4])
    print(feat_all.shape)

    #mlstm
    print(tf.get_variable_scope().reuse)
    mlstm_top = rnn.mlstm_layer('mlstm', feat_all, None, 500)[0]
    print(tf.get_variable_scope().reuse)

    # MLP classifier
    with tf.variable_scope('classifier'):
        mlp_l1 = conv('mlp_l1',
                      mlstm_top,
                      kernel_size=1,
                      stride=1,
                      output_dim=1)
        #if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        #mlp_l2 = conv('mlp_l2', mlp_l1, kernel_size=1, stride=1, output_dim=1)

    return mlp_l1
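A call sketch with placeholder hyper-parameters (illustrative only, not values from the source repository); it assumes the lstm_net, deeplab, rnn and generate_spatial_batch helpers used above are importable:
import tensorflow as tf

T, N = 20, 1  # static sequence length and batch size
text_seq_batch = tf.placeholder(tf.int32, [T, N])
imcrop_batch = tf.placeholder(tf.float32, [N, 320, 320, 3])
scores = recurrent_multimodal(text_seq_batch, imcrop_batch,
                              num_vocab=1000, embed_dim=300, lstm_dim=500,
                              mlp_hidden_dims=500, feature_vis_dropout=False,
                              mlp_dropout=False)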
Example #19
    def __init__(self, image_feat_grid, text_seq_batch, seq_length_batch,
        T_decoder, num_vocab_txt, embed_dim_txt, num_vocab_nmn,
        embed_dim_nmn, lstm_dim, num_layers, assembler,
        encoder_dropout, decoder_dropout, decoder_sampling,
        num_choices, use_qpn, qpn_dropout, reduce_visfeat_dim=False, new_visfeat_dim=256,
        use_gt_layout=None, gt_layout_batch=None,
        scope='neural_module_network', reuse=None):

        with tf.variable_scope(scope, reuse=reuse):
            # Part 0: Visual feature from CNN
            self.reduce_visfeat_dim = reduce_visfeat_dim
            if reduce_visfeat_dim:
                # use an extra linear 1x1 conv layer (without ReLU)
                # to reduce the feature dimension
                with tf.variable_scope('reduce_visfeat_dim'):
                    image_feat_grid = conv('conv_reduce_visfeat_dim',
                        image_feat_grid, kernel_size=1, stride=1,
                        output_dim=new_visfeat_dim)
                print('visual feature dimension reduced to %d' % new_visfeat_dim)
            self.image_feat_grid = image_feat_grid

            # Part 1: Seq2seq RNN to generate module layout tokens
            with tf.variable_scope('layout_generation'):
                att_seq2seq = AttentionSeq2Seq(text_seq_batch,
                    seq_length_batch, T_decoder, num_vocab_txt,
                    embed_dim_txt, num_vocab_nmn, embed_dim_nmn, lstm_dim,
                    num_layers, assembler, encoder_dropout, decoder_dropout,
                    decoder_sampling, use_gt_layout, gt_layout_batch)
                self.att_seq2seq = att_seq2seq
                predicted_tokens = att_seq2seq.predicted_tokens
                token_probs = att_seq2seq.token_probs
                word_vecs = att_seq2seq.word_vecs
                neg_entropy = att_seq2seq.neg_entropy
                self.atts = att_seq2seq.atts

                self.predicted_tokens = predicted_tokens
                self.token_probs = token_probs
                self.word_vecs = word_vecs
                self.neg_entropy = neg_entropy

                # log probability of each generated sequence
                self.log_seq_prob = tf.reduce_sum(tf.log(token_probs), axis=0)

            # Part 2: Neural Module Network
            with tf.variable_scope('layout_execution'):
                modules = Modules(image_feat_grid, word_vecs, None, num_choices)
                self.modules = modules
                # Recursion of modules
                att_shape = image_feat_grid.get_shape().as_list()[1:-1] + [1]
                # Forward declaration of module recursion
                att_expr_decl = td.ForwardDeclaration(td.PyObjectType(), td.TensorType(att_shape))
                # _Scene
                case_scene = td.Record([('time_idx', td.Scalar(dtype='int32')),
                                       ('batch_idx', td.Scalar(dtype='int32'))])
                case_scene = case_scene >> td.Function(modules.SceneModule)
                # _Find
                case_find = td.Record([('time_idx', td.Scalar(dtype='int32')),
                                       ('batch_idx', td.Scalar(dtype='int32'))])
                case_find = case_find >> td.Function(modules.FindModule)
                # _Filter
                case_filter = td.Record([('input_0', att_expr_decl()),
                                         ('time_idx', td.Scalar(dtype='int32')),
                                         ('batch_idx', td.Scalar(dtype='int32'))])
                case_filter = case_filter >> td.Function(modules.FilterModule)
                # _FindSameProperty
                case_find_same_property = td.Record([('input_0', att_expr_decl()),
                                                     ('time_idx', td.Scalar(dtype='int32')),
                                                     ('batch_idx', td.Scalar(dtype='int32'))])
                case_find_same_property = case_find_same_property >> \
                    td.Function(modules.FindSamePropertyModule)
                # _Transform
                case_transform = td.Record([('input_0', att_expr_decl()),
                                            ('time_idx', td.Scalar('int32')),
                                            ('batch_idx', td.Scalar('int32'))])
                case_transform = case_transform >> td.Function(modules.TransformModule)
                # _And
                case_and = td.Record([('input_0', att_expr_decl()),
                                      ('input_1', att_expr_decl()),
                                      ('time_idx', td.Scalar('int32')),
                                      ('batch_idx', td.Scalar('int32'))])
                case_and = case_and >> td.Function(modules.AndModule)
                # _Or
                case_or = td.Record([('input_0', att_expr_decl()),
                                     ('input_1', att_expr_decl()),
                                     ('time_idx', td.Scalar('int32')),
                                     ('batch_idx', td.Scalar('int32'))])
                case_or = case_or >> td.Function(modules.OrModule)
                # _Exist
                case_exist = td.Record([('input_0', att_expr_decl()),
                                        ('time_idx', td.Scalar('int32')),
                                        ('batch_idx', td.Scalar('int32'))])
                case_exist = case_exist >> td.Function(modules.ExistModule)
                # _Count
                case_count = td.Record([('input_0', att_expr_decl()),
                                        ('time_idx', td.Scalar('int32')),
                                        ('batch_idx', td.Scalar('int32'))])
                case_count = case_count >> td.Function(modules.CountModule)
                # _EqualNum
                case_equal_num = td.Record([('input_0', att_expr_decl()),
                                            ('input_1', att_expr_decl()),
                                            ('time_idx', td.Scalar('int32')),
                                            ('batch_idx', td.Scalar('int32'))])
                case_equal_num = case_equal_num >> td.Function(modules.EqualNumModule)
                # _MoreNum
                case_more_num = td.Record([('input_0', att_expr_decl()),
                                            ('input_1', att_expr_decl()),
                                            ('time_idx', td.Scalar('int32')),
                                            ('batch_idx', td.Scalar('int32'))])
                case_more_num = case_more_num >> td.Function(modules.MoreNumModule)
                # _LessNum
                case_less_num = td.Record([('input_0', att_expr_decl()),
                                            ('input_1', att_expr_decl()),
                                            ('time_idx', td.Scalar('int32')),
                                            ('batch_idx', td.Scalar('int32'))])
                case_less_num = case_less_num >> td.Function(modules.LessNumModule)
                # _SameProperty
                case_same_property = td.Record([('input_0', att_expr_decl()),
                                                ('input_1', att_expr_decl()),
                                                ('time_idx', td.Scalar('int32')),
                                                ('batch_idx', td.Scalar('int32'))])
                case_same_property = case_same_property >> \
                    td.Function(modules.SamePropertyModule)
                # _Describe
                case_describe = td.Record([('input_0', att_expr_decl()),
                                           ('time_idx', td.Scalar('int32')),
                                           ('batch_idx', td.Scalar('int32'))])
                case_describe = case_describe >> \
                    td.Function(modules.DescribeModule)

                recursion_cases = td.OneOf(td.GetItem('module'), {
                    '_Scene': case_scene,
                    '_Find': case_find,
                    '_Filter': case_filter,
                    '_FindSameProperty': case_find_same_property,
                    '_Transform': case_transform,
                    '_And': case_and,
                    '_Or': case_or})
                att_expr_decl.resolve_to(recursion_cases)

                # For invalid expressions, define a dummy answer
                # so that all answers have the same form
                dummy_scores = td.Void() >> td.FromTensor(np.zeros(num_choices, np.float32))
                output_scores = td.OneOf(td.GetItem('module'), {
                    '_Exist': case_exist,
                    '_Count': case_count,
                    '_EqualNum': case_equal_num,
                    '_MoreNum': case_more_num,
                    '_LessNum': case_less_num,
                    '_SameProperty': case_same_property,
                    '_Describe': case_describe,
                    INVALID_EXPR: dummy_scores})

                # compile and get the output scores
                self.compiler = td.Compiler.create(output_scores)
                self.scores_nmn = self.compiler.output_tensors[0]

            # Add a question prior network if specified
            self.use_qpn = use_qpn
            self.qpn_dropout = qpn_dropout
            if use_qpn:
                self.scores_qpn = question_prior_net(att_seq2seq.encoder_states,
                                                     num_choices, qpn_dropout)
                self.scores = self.scores_nmn + self.scores_qpn
            else:
                self.scores = self.scores_nmn

            # Regularization: Entropy + L2
            self.entropy_reg = tf.reduce_mean(neg_entropy)
            module_weights = [v for v in tf.trainable_variables()
                              if (scope in v.op.name and
                                  v.op.name.endswith('weights'))]
            self.l2_reg = tf.add_n([tf.nn.l2_loss(v) for v in module_weights])
Example #20
    def model_structure(self,
                        sen_data,
                        enc_data,
                        dec_data,
                        msk_data,
                        vis_data,
                        batch_size,
                        is_train,
                        dropout=None):
        def set_drop_test():
            return tf.cast(1.0, tf.float32)

        def set_drop_train():
            return tf.cast(self.dropout, tf.float32)

        dropout = tf.cond(is_train, set_drop_train, set_drop_test)

        seq_length = tf.reduce_sum(msk_data, 1)
        text_seq_batch = sen_data

        with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
            embedding_mat = tf.get_variable(
                "embedding", [self.vocab_size, self.lstm_dim],
                tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(uniform=True))
            # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
            embedded_seq = tf.nn.embedding_lookup(embedding_mat,
                                                  text_seq_batch)

        # encode the phrase based on the last step of the hidden states
        outputs, states = lstm('enc_lstm',
                               embedded_seq,
                               None,
                               seq_length,
                               output_dim=self.lstm_dim,
                               num_layers=1,
                               forget_bias=1.0,
                               apply_dropout=True,
                               keep_prob=dropout,
                               concat_output=False,
                               initializer=tf.random_uniform_initializer(
                                   minval=-0.08, maxval=0.08))

        sen_raw = states[-1].h
        sen_raw = tf.nn.l2_normalize(sen_raw, axis=1)

        # print sen_raw.get_shape()
        vis_raw = tf.reshape(
            vis_data, [self.batch_size * self.num_prop, self.img_feat_size])

        sen_output = tf.reshape(sen_raw,
                                [self.batch_size, 1, 1, self.lstm_dim])
        vis_output = tf.reshape(
            vis_raw, [self.batch_size, self.num_prop, 1, self.img_feat_size])

        sen_tile = tf.tile(sen_output, [1, self.num_prop, 1, 1])
        feat_concat = tf.concat([sen_tile, vis_output], 3)

        feat_proj_init = msr_init(
            [1, 1, self.lstm_dim + self.img_feat_size, self.hidden_size])
        feat_proj = conv("feat_proj",
                         feat_concat,
                         1,
                         1,
                         self.hidden_size,
                         weights_initializer=feat_proj_init)
        feat_relu = tf.nn.relu(feat_proj)

        att_conv_init = msr_init([1, 1, self.hidden_size, 1])
        att_conv = conv("att_conv",
                        feat_relu,
                        1,
                        1,
                        1,
                        weights_initializer=att_conv_init)

        #Generate the visual attention feature
        att_scores_t = tf.reshape(att_conv, [self.batch_size, self.num_prop])
        # att_prob = tf.nn.softmax(att_scores_t)
        att_prob = tf.nn.relu(att_scores_t)

        att_scores = tf.reshape(att_prob, [self.batch_size, self.num_prop, 1])

        vis_att_feat = tf.reduce_sum(
            tf.multiply(vis_data,
                        tf.tile(att_scores, [1, 1, self.img_feat_size])), 1)
        vis_att_featFC = fc_relu(
            "vis_enc",
            vis_att_feat,
            self.lstm_dim,
            weights_initializer=tf.random_uniform_initializer(minval=-0.002,
                                                              maxval=0.002))

        vis_att_tile = tf.reshape(vis_att_featFC,
                                  [self.batch_size, 1, self.lstm_dim])

        text_enc_batch = enc_data
        # embedded_enc: batch_size x phrase_len x lstm_dim
        with tf.variable_scope('enc_embedding'), tf.device("/cpu:0"):
            embedding_enc = tf.get_variable(
                "embedding", [self.vocab_size, self.lstm_dim],
                tf.float32,
                initializer=tf.contrib.layers.xavier_initializer(uniform=True))
            # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
            embedded_enc = tf.nn.embedding_lookup(embedding_enc,
                                                  text_enc_batch)

        # dec_vis_embed = batch_size x phrase_len x (2*lstm_dim)
        dec_vis_embed = tf.concat([
            embedded_enc,
            tf.concat([
                vis_att_tile,
                tf.zeros((self.batch_size, self.phrase_len - 1, self.lstm_dim))
            ], 1)
        ], 2)
        # dec_outputs: batch_size x phrase_len x lstm_dim
        dec_outs, _ = lstm('dec_lstm',
                           dec_vis_embed,
                           None,
                           seq_length,
                           output_dim=self.lstm_dim,
                           num_layers=1,
                           forget_bias=1.0,
                           apply_dropout=True,
                           keep_prob=dropout,
                           concat_output=True,
                           initializer=tf.random_uniform_initializer(
                               minval=-0.08, maxval=0.08))

        dec_outs = tf.reshape(
            dec_outs, [self.batch_size * self.phrase_len, self.lstm_dim])
        # dec_logits: (batch_size*phrase_len) x vocab_size
        dec_logits = fc(
            'dec_logits',
            dec_outs,
            self.vocab_size,
            weights_initializer=tf.contrib.layers.xavier_initializer(
                uniform=True))

        return att_scores_t, dec_logits, vis_data
Example #21
def vgg_fc8_full_conv(input_batch, name, apply_dropout, output_dim=1000):
    fc7 = vgg_fc7_full_conv(input_batch, name, apply_dropout)
    with tf.variable_scope(name):
        # layer 8 (no ReLU after fc8)
        fc8 = conv('fc8', fc7, kernel_size=1, stride=1, output_dim=output_dim)
        return fc8