def conv_net_shallow(input_batch, name):
    """Shallow convnet: four 2x2 conv+ReLU layers followed by a tanh.

    Each layer's output tensor is printed for shape debugging.
    Returns tanh(conv4).
    """
    # (variable-scope name, stride, output channels); every kernel is 2x2
    layer_specs = [('conv1', 2, 4),
                   ('conv2', 1, 8),
                   ('conv3', 2, 16),
                   ('conv4', 1, 16)]
    with tf.variable_scope(name):
        feat = input_batch
        for layer_name, layer_stride, layer_dim in layer_specs:
            feat = conv_relu(layer_name, feat, kernel_size=2,
                             stride=layer_stride, output_dim=layer_dim)
            print(layer_name + ": ", feat)
        out = tf.nn.tanh(feat)
    return out
def vgg_fc7_full_conv(input_batch, name, apply_dropout):
    """VGG-16 through fc7, with fc6/fc7 expressed as convolutions.

    apply_dropout: when True, applies 0.5 dropout after fc6 and after fc7.
    Returns the fc7 feature map.
    """
    feat = vgg_pool5(input_batch, name)
    with tf.variable_scope(name):
        # fc6 is a 7x7 conv over pool5; fc7 is a 1x1 conv
        for fc_name, kernel in (('fc6', 7), ('fc7', 1)):
            feat = conv_relu(fc_name, feat, kernel_size=kernel, stride=1,
                             output_dim=4096)
            if apply_dropout:
                feat = drop(feat, 0.5)
    return feat
def shapes_convnet(input_batch, hidden_dim=64, output_dim=64,
                   scope='shapes_convnet', reuse=None):
    """Two-layer convnet for shapes images.

    input_batch: image tensor of shape [N, H_im, W_im, 3].
    Returns the conv_2 feature map.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # 10x10 stride-10 VALID conv collapses each 10x10 cell into one vector
        hidden = conv_relu('conv_1', input_batch, kernel_size=10, stride=10,
                           output_dim=hidden_dim, padding='VALID')
        # 1x1 conv mixes channels within each cell
        out = conv_relu('conv_2', hidden, kernel_size=1, stride=1,
                        output_dim=output_dim)
    return out
def vgg_fc7_full_conv(input_batch, name, apply_dropout, reuse=None):
    """VGG-16 through fc7 as convolutions, with variable reuse support.

    apply_dropout: when True, applies 0.5 dropout after fc6 and after fc7.
    reuse: forwarded to vgg_pool5 and to this function's variable scope.
    Returns the fc7 feature map.
    """
    feat = vgg_pool5(input_batch, name, reuse)
    with tf.variable_scope(name, reuse=reuse):
        # fc6 is a 7x7 conv over pool5; fc7 is a 1x1 conv
        for fc_name, kernel in (('fc6', 7), ('fc7', 1)):
            feat = conv_relu(fc_name, feat, kernel_size=kernel, stride=1,
                             output_dim=4096)
            if apply_dropout:
                feat = drop(feat, 0.5)
    return feat
def rpn_net(conv5, im_info, name, feat_stride=16, anchor_scales=(8, 16, 32),
            phase='TEST'):
    """Region-proposal-network head over conv5 features.

    Returns ROIs of shape [None, 5] produced by ProposalLayer via tf.py_func.
    """
    # 3 aspect ratios per scale
    num_anchors = len(anchor_scales) * 3
    with tf.variable_scope(name):
        shared = conv_relu('rpn_conv/3x3', conv5, kernel_size=3, stride=1,
                           output_dim=512)
        # bg weights were already subtracted from fg weights upstream, so a
        # sigmoid (not even needed for ranking) replaces the usual softmax
        cls_score = conv('rpn_cls_score', shared, kernel_size=1, stride=1,
                         output_dim=num_anchors)
        bbox_pred = conv('rpn_bbox_pred', shared, kernel_size=1, stride=1,
                         output_dim=num_anchors * 4)
        rois = tf.py_func(ProposalLayer(feat_stride, anchor_scales, phase),
                          [cls_score, bbox_pred, im_info], [tf.float32],
                          stateful=False)[0]
        rois.set_shape([None, 5])
    return rois
def text_objseg_full_conv(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
                          lstm_dim, mlp_hidden_dims, vgg_dropout, mlp_dropout):
    """Text-conditioned object segmentation, fully convolutional (VGG backbone).

    Builds an LSTM sentence embedding, tiles it over the VGG fc8 feature map,
    concatenates it with L2-normalized visual features and spatial coordinates,
    and scores every location with a 1x1-conv MLP.
    Returns the mlp_l2 score map (1 channel).
    """
    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)
    # Local image feature
    feat_vis = vgg_net.vgg_fc8_full_conv(imcrop_batch, 'vgg_local',
                                         apply_dropout=vgg_dropout)
    # Reshape and tile LSTM top over every spatial location
    featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
    N, D_text = feat_lang.get_shape().as_list()
    feat_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                        [1, featmap_H, featmap_W, 1])
    # L2-normalize the features (except for spatial_batch)
    # and concatenate them along axis 3 (channel dimension)
    spatial_batch = tf.convert_to_tensor(
        generate_spatial_batch(N, featmap_H, featmap_W))
    # FIX: use the TF>=1.0 keyword form of tf.concat (the old positional
    # tf.concat(3, [...]) order is rejected by TF 1.x); this also matches the
    # other *_full_conv builders in this file
    feat_all = tf.concat(axis=3, values=[tf.nn.l2_normalize(feat_lang, 3),
                                         tf.nn.l2_normalize(feat_vis, 3),
                                         spatial_batch])
    # MLP classifier over the concatenated feature
    with tf.variable_scope('classifier'):
        mlp_l1 = conv_relu('mlp_l1', feat_all, kernel_size=1, stride=1,
                           output_dim=mlp_hidden_dims)
        if mlp_dropout:
            mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = conv('mlp_l2', mlp_l1, kernel_size=1, stride=1, output_dim=1)
    return mlp_l2
def forward(self, imcrop_batch, text_seq_batch, is_training=True):
    """Build the text-conditioned segmentation graph (DeepLab backbone).

    Tiles the LSTM sentence embedding over the DeepLab fc8 feature map,
    concatenates it with L2-normalized visual features plus spatial
    coordinates, scores each location with a 1x1-conv MLP, and upsamples
    the score map 8x with a transposed convolution.
    Returns the upsampled 1-channel score map.
    """
    num_vocab, embed_dim, lstm_dim, mlp_hidden_dims = \
        self.num_vocab, self.embed_dim, self.lstm_dim, self.mlp_hidden_dims
    # Idiomatic dict.get with default instead of membership test + indexing
    deeplab_dropout = self.kwargs.get('deeplab_dropout', False)
    mlp_dropout = self.kwargs.get('mlp_dropout', False)
    with tf.variable_scope(self.model_name):
        # Language feature (LSTM hidden state)
        feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                      lstm_dim)[0]
        # Local image feature
        feat_vis = deeplab.deeplab_fc8_full_conv(
            imcrop_batch, 'deeplab', apply_dropout=deeplab_dropout)
        # Reshape and tile LSTM top over every spatial location
        featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
        N, D_text = feat_lang.get_shape().as_list()
        feat_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                            [1, featmap_H, featmap_W, 1])
        # L2-normalize the features (except for spatial_batch)
        # and concatenate them along axis 3 (channel dimension)
        spatial_batch = tf.convert_to_tensor(
            generate_spatial_batch(N, featmap_H, featmap_W))
        feat_all = tf.concat(axis=3, values=[
            tf.nn.l2_normalize(feat_lang, 3),
            tf.nn.l2_normalize(feat_vis, 3),
            spatial_batch
        ])
        # MLP classifier over the concatenated feature
        with tf.variable_scope('classifier'):
            mlp_l1 = conv_relu('mlp_l1', feat_all, kernel_size=1, stride=1,
                               output_dim=mlp_hidden_dims)
            if mlp_dropout:
                mlp_l1 = drop(mlp_l1, 0.5)
            mlp_l2 = conv('mlp_l2', mlp_l1, kernel_size=1, stride=1,
                          output_dim=1)
        # 8x bilinear-style upsampling via transposed conv (no bias)
        upsample8s = deconv('upsample8s', mlp_l2, kernel_size=16, stride=8,
                            output_dim=1, bias_term=False)
    return upsample8s
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False):
    """Two-layer 1x1-conv scoring head.

    input_batch: input feature tensor.
    middle_layer_dim: hidden width of the first 1x1 conv+ReLU layer.
    reuse: when True, reuse existing variables in this scope.
    Returns a 3-channel similarity-score tensor.
    """
    with tf.variable_scope(name):
        # FIX: Python-2 `print x` statements are a SyntaxError on Python 3;
        # converted to print() calls (the function form used elsewhere in
        # this file). Also `if reuse==True:` -> `if reuse:`.
        if reuse:
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")
        layer1 = conv_relu('layer1', input_batch, kernel_size=1, stride=1,
                           output_dim=middle_layer_dim)
        sim_score = conv('layer2', layer1, kernel_size=1, stride=1,
                         output_dim=3)
    return sim_score
def text_objseg_full_conv(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
                          lstm_dim, mlp_hidden_dims, deeplab_dropout,
                          mlp_dropout, is_training):
    """Text-conditioned object segmentation, fully convolutional
    (DeepLab-ResNet101 backbone).

    NOTE(review): deeplab_dropout is currently unused — the DeepLab101 path
    below does not take a dropout flag.
    Returns the mlp_l2 score map (1 channel).
    """
    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)[0]
    # Visual feature: DeepLab-ResNet101, 'fc1_voc12' head
    net = deeplab101.DeepLabResNetModel({'data': imcrop_batch},
                                        is_training=is_training)
    feat_vis = net.layers['fc1_voc12']
    # Tile the sentence embedding to every spatial location of the feature map
    featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
    N, D_text = feat_lang.get_shape().as_list()
    tiled_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                         [1, featmap_H, featmap_W, 1])
    # L2-normalize language/visual features (not spatial coords) and
    # concatenate along the channel dimension
    spatial_batch = tf.convert_to_tensor(
        generate_spatial_batch(N, featmap_H, featmap_W))
    feat_all = tf.concat(axis=3,
                         values=[tf.nn.l2_normalize(tiled_lang, 3),
                                 tf.nn.l2_normalize(feat_vis, 3),
                                 spatial_batch])
    # 1x1-conv MLP classifier over the concatenated feature
    with tf.variable_scope('classifier'):
        hidden = conv_relu('mlp_l1', feat_all, kernel_size=1, stride=1,
                           output_dim=mlp_hidden_dims)
        if mlp_dropout:
            hidden = drop(hidden, 0.5)
        score = conv('mlp_l2', hidden, kernel_size=1, stride=1, output_dim=1)
    return score
def vgg_conv5(input_batch, name, reuse=None):
    """VGG-16 convolutional body up to conv5_3 (no pool5).

    reuse: forwarded to tf.variable_scope.
    Returns the conv5_3 feature map.
    """
    # (block index, number of 3x3 convs, output channels); blocks 1-4 end
    # with a 2x2/2 max-pool, block 5 does not
    pooled_blocks = [(1, 2, 64), (2, 2, 128), (3, 3, 256), (4, 3, 512)]
    with tf.variable_scope(name, reuse=reuse):
        feat = input_batch
        for block_idx, n_convs, dim in pooled_blocks:
            for conv_idx in range(1, n_convs + 1):
                feat = conv_relu('conv%d_%d' % (block_idx, conv_idx), feat,
                                 kernel_size=3, stride=1, output_dim=dim)
            feat = pool('pool%d' % block_idx, feat, kernel_size=2, stride=2)
        # block 5: three convs, no pooling
        for conv_idx in range(1, 4):
            feat = conv_relu('conv5_%d' % conv_idx, feat,
                             kernel_size=3, stride=1, output_dim=512)
    return feat
def vgg_pool5(input_batch, name, reuse=None):
    """VGG-16 convolutional body through pool5.

    reuse: forwarded to tf.variable_scope. FIX: this parameter is added
    (default None, fully backward compatible with two-argument callers)
    because vgg_fc7_full_conv invokes vgg_pool5(input_batch, name, reuse),
    which would raise TypeError against the old two-parameter signature;
    it also makes this builder consistent with vgg_conv5.
    Returns the pool5 feature map.
    """
    # (block index, number of 3x3 convs, output channels); every block ends
    # with a 2x2/2 max-pool
    blocks = [(1, 2, 64), (2, 2, 128), (3, 3, 256), (4, 3, 512), (5, 3, 512)]
    with tf.variable_scope(name, reuse=reuse):
        feat = input_batch
        for block_idx, n_convs, dim in blocks:
            for conv_idx in range(1, n_convs + 1):
                feat = conv_relu('conv%d_%d' % (block_idx, conv_idx), feat,
                                 kernel_size=3, stride=1, output_dim=dim)
            feat = pool('pool%d' % block_idx, feat, kernel_size=2, stride=2)
    return feat
def __init__(self, input_batch, masking_batch, grad_scaling, scope='vgg_net',
             reuse=None):
    """VGG-16 feature extractor with a learnable adversarial perturbation.

    input_batch: original masked image; the channel mean has NOT been
        subtracted yet (it is subtracted here, after the perturbation).
    masking_batch: boolean tensor; True positions receive zero perturbation.
    grad_scaling: scalar multiplier applied to the perturbation variable.

    Exposes: self.W_adv3 (clipped perturbation), self.adv_input (adversarial
    image fed to VGG), self.image_feat_grid (pool5 features).
    NOTE(review): relies on module-level `input_shape` and `channel_mean`
    defined elsewhere in this file.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Learnable adversarial perturbation, broadcast over the batch
        W_adv = tf.get_variable(
            'adv_weights', [1] + input_shape,
            initializer=tf.contrib.layers.xavier_initializer())
        # Zero the perturbation wherever the mask is True
        masked_pert = tf.where(masking_batch,
                               tf.zeros(tf.shape(W_adv), dtype=tf.float32),
                               W_adv)
        # Clip so input + grad_scaling * perturbation stays within [0, 255]:
        # first cap values that would exceed 255 ...
        capped_hi = tf.where(
            tf.greater(input_batch + grad_scaling * masked_pert, 255.0),
            (1. / grad_scaling) * (255.0 - input_batch), masked_pert)
        # ... then floor values that would fall below 0
        clipped_pert = tf.where(
            tf.less(input_batch + grad_scaling * capped_hi, 0.0),
            (-1. / grad_scaling) * input_batch, capped_hi)
        # Channel mean is subtracted only now, before feeding VGG-16
        adv_img = input_batch + grad_scaling * clipped_pert - channel_mean
        self.W_adv3 = clipped_pert
        self.adv_input = adv_img
        # Standard VGG-16 body through pool5:
        # (block index, number of 3x3 convs, output channels)
        feat = adv_img
        for block_idx, n_convs, dim in [(1, 2, 64), (2, 2, 128), (3, 3, 256),
                                        (4, 3, 512), (5, 3, 512)]:
            for conv_idx in range(1, n_convs + 1):
                feat = conv_relu('conv%d_%d' % (block_idx, conv_idx), feat,
                                 kernel_size=3, stride=1, output_dim=dim)
            feat = pool('pool%d' % block_idx, feat, kernel_size=2, stride=2)
        self.image_feat_grid = feat