def test_convert_collection_to_dict_clear_collection(self):
   t1 = constant_op.constant(1.0, name='t1')
   t2 = constant_op.constant(2.0, name='t2')
   utils.collect_named_outputs('end_points', 'a1', t1)
   utils.collect_named_outputs('end_points', 'a21', t2)
   utils.collect_named_outputs('end_points', 'a22', t2)
   utils.convert_collection_to_dict('end_points', clear_collection=True)
   self.assertEqual(ops.get_collection('end_points'), [])
Example #2
def vgg_a(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.5,
          spatial_squeeze=True,
          scope='vgg_a'):
  """Oxford Net VGG 11-Layers version A Example.

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not to squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.

  Returns:
    the last op containing the log predictions and end_points dict.
  """
  with variable_scope.variable_scope(scope, 'vgg_a', [inputs]) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d.
    with arg_scope(
        [layers.conv2d, layers_lib.max_pool2d],
        outputs_collections=end_points_collection):
      net = layers_lib.repeat(
          inputs, 1, layers.conv2d, 64, [3, 3], scope='conv1')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool1')
      net = layers_lib.repeat(net, 1, layers.conv2d, 128, [3, 3], scope='conv2')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool2')
      net = layers_lib.repeat(net, 2, layers.conv2d, 256, [3, 3], scope='conv3')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool3')
      net = layers_lib.repeat(net, 2, layers.conv2d, 512, [3, 3], scope='conv4')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool4')
      net = layers_lib.repeat(net, 2, layers.conv2d, 512, [3, 3], scope='conv5')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool5')
      # Use conv2d instead of fully_connected layers.
      net = layers.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6')
      net = layers_lib.dropout(
          net, dropout_keep_prob, is_training=is_training, scope='dropout6')
      net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
      net = layers_lib.dropout(
          net, dropout_keep_prob, is_training=is_training, scope='dropout7')
      net = layers.conv2d(
          net,
          num_classes, [1, 1],
          activation_fn=None,
          normalizer_fn=None,
          scope='fc8')
      # Convert end_points_collection into an end_point dict.
      end_points = utils.convert_collection_to_dict(end_points_collection)
      if spatial_squeeze:
        net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
      return net, end_points
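A minimal usage sketch for vgg_a above (illustrative only, not part of the original
snippet): it assumes TF 1.x graph mode and simply shows that the returned end_points
dict is keyed by the aliases collected via outputs_collections.

import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 224, 224, 3], name='images')
logits, end_points = vgg_a(images, num_classes=1000, is_training=False)

# end_points maps aliases (e.g. something like 'vgg_a/conv1/conv1_1' or
# 'vgg_a/pool5', depending on the active scopes) to the collected tensors.
for alias in sorted(end_points):
    print(alias, end_points[alias].shape)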
 def _resnet_plain(self, inputs, blocks, output_stride=None, scope=None):
   """A plain ResNet without extra layers before or after the ResNet blocks."""
   with variable_scope.variable_scope(scope, values=[inputs]):
     with arg_scope([layers.conv2d], outputs_collections='end_points'):
       net = resnet_utils.stack_blocks_dense(inputs, blocks, output_stride)
       end_points = utils.convert_collection_to_dict('end_points')
       return net, end_points
Example #4
 def test_convert_collection_to_dict(self):
   t1 = constant_op.constant(1.0, name='t1')
   t2 = constant_op.constant(2.0, name='t2')
   utils.collect_named_outputs('end_points', 'a1', t1)
   utils.collect_named_outputs('end_points', 'a21', t2)
   utils.collect_named_outputs('end_points', 'a22', t2)
   end_points = utils.convert_collection_to_dict('end_points')
   self.assertEqual(end_points['a1'], t1)
   self.assertEqual(end_points['a21'], t2)
   self.assertEqual(end_points['a22'], t2)
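The tests above (test_convert_collection_to_dict and its clear_collection variant) show the basic round trip these utilities provide. A small standalone sketch of the same pattern (an illustration; it assumes TF 1.x and the tf.contrib.layers location of utils, so adjust the import to match your codebase):

import tensorflow as tf
from tensorflow.contrib.layers.python.layers import utils

x = tf.constant(3.0, name='x')
# Append x to the 'my_end_points' graph collection under the alias 'branch/x'.
utils.collect_named_outputs('my_end_points', 'branch/x', x)
# Rebuild an {alias: tensor} mapping from that collection.
end_points = utils.convert_collection_to_dict('my_end_points')
assert end_points['branch/x'] is x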
  def body(self, features):
    hp = self.hparams
    # pylint: disable=eval-used
    if hp.image_input_type == "image":
      image_feat = vqa_layers.image_embedding(
          features["inputs"],
          model_fn=eval(hp.image_model_fn),
          trainable=hp.train_resnet,
          is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
    else:
      image_feat = features["inputs"]

    image_feat = common_layers.flatten4d3d(image_feat)
    # image feature self attention
    # image_feat = tf.nn.dropout(
    #     image_feat, keep_prob=1.-hp.layer_prepostprocess_dropout)

    # image_feat = image_feat - tf.reduce_mean(
    #     image_feat, axis=-1, keepdims=True)
    # image_feat = tf.nn.l2_normalize(image_feat, -1)
    # utils.collect_named_outputs("norms", "image_feat_after_l2",
    #                             tf.norm(image_feat, axis=-1))

    image_feat = tf.nn.dropout(image_feat, keep_prob=1.-hp.dropout)

    image_feat = image_encoder(image_feat, hp)
    utils.collect_named_outputs("norms", "image_feat_encoded",
                                tf.norm(image_feat, axis=-1))
    image_feat = common_layers.l2_norm(image_feat)
    utils.collect_named_outputs("norms", "image_feat_encoded_l2",
                                tf.norm(image_feat, axis=-1))

    query = question_encoder(features["question"], hp)
    utils.collect_named_outputs("norms", "query",
                                tf.norm(query, axis=-1))

    image_ave = attn(image_feat, query, hp)
    utils.collect_named_outputs("norms", "image_ave",
                                tf.norm(image_ave, axis=-1))

    image_question = tf.concat([image_ave, query], axis=1)
    utils.collect_named_outputs("norms", "image_question",
                                tf.norm(image_question, axis=-1))

    image_question = tf.nn.dropout(image_question, 1. - hp.dropout)

    output = mlp(image_question, hp)
    utils.collect_named_outputs("norms", "output",
                                tf.norm(output, axis=-1))

    norm_tensors = utils.convert_collection_to_dict("norms")
    vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

    # Expand dimension 1 and 2
    return tf.expand_dims(tf.expand_dims(output, axis=1), axis=2)
  def body(self, features):
    hp = self.hparams
    # pylint: disable=eval-used
    if hp.image_input_type == "image":
      image_feat = vqa_layers.image_embedding(
          features["inputs"],
          model_fn=eval(hp.image_model_fn),
          trainable=hp.train_resnet,
          is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
    else:
      image_feat = features["inputs"]

    image_feat = common_layers.flatten4d3d(image_feat)
    image_feat = common_layers.dense(image_feat, hp.hidden_size)
    utils.collect_named_outputs("norms", "image_feat_after_proj",
                                tf.norm(image_feat, axis=-1))

    question = common_layers.flatten4d3d(features["question"])
    utils.collect_named_outputs("norms", "question_embedding",
                                tf.norm(question, axis=-1))
    (encoder_input, encoder_self_attention_bias,
     encoder_decoder_attention_bias) = prepare_image_question_encoder(
         image_feat, question, hp)

    encoder_input = tf.nn.dropout(
        encoder_input, keep_prob=1.-hp.layer_prepostprocess_dropout)

    encoder_output, _ = recurrent_transformer_decoder(
        encoder_input, None, encoder_self_attention_bias, None,
        hp, name="encoder")
    utils.collect_named_outputs(
        "norms", "encoder_output", tf.norm(encoder_output, axis=-1))

    # scale query by sqrt(hidden_size)
    query = tf.get_variable("query", [hp.hidden_size]) * hp.hidden_size **0.5
    query = tf.expand_dims(tf.expand_dims(query, axis=0), axis=0)
    batch_size = common_layers.shape_list(encoder_input)[0]
    query = tf.tile(query, [batch_size, 1, 1])
    query = tf.nn.dropout(
        query, keep_prob=1.-hp.layer_prepostprocess_dropout)

    decoder_output, _ = recurrent_transformer_decoder(
        query, encoder_output, None, encoder_decoder_attention_bias,
        hp, name="decoder")
    utils.collect_named_outputs("norms", "decoder_output",
                                tf.norm(decoder_output, axis=-1))

    norm_tensors = utils.convert_collection_to_dict("norms")
    vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

    # Expand dimension 1.
    return tf.expand_dims(decoder_output, axis=1)
Example #7
def vgg_16_tcomb(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           spatial_squeeze=True,
           scope='vgg_16_tcomb'):

  with variable_scope.variable_scope(scope, 'vgg_16_tcomb', [inputs]) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d.
    with arg_scope(
        [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d],
        outputs_collections=end_points_collection):
      net = init_conv_comb(inputs, 2, 64, [3, 3], 'conv1')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool1')

      net = repeat_conv_comb(net, 2, 64, 128, [3, 3], 'conv2')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool2')

      net = repeat_conv_comb(net, 3, 128, 256, [3, 3], 'conv3')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool3')

      net = repeat_conv_comb(net, 3, 256, 512, [3, 3], 'conv4')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool4')

      net = repeat_conv_comb(net, 3, 512, 512, [3, 3], 'conv5')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool5')

      # Use conv2d instead of fully_connected layers.
      net = layers.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6')
      net = layers_lib.dropout(
          net, dropout_keep_prob, is_training=is_training, scope='dropout6')
      net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
      net = layers_lib.dropout(
          net, dropout_keep_prob, is_training=is_training, scope='dropout7')
      net = layers.conv2d(
          net,
          num_classes, [1, 1],
          activation_fn=None,
          normalizer_fn=None,
          scope='fc8')
      # Convert end_points_collection into an end_point dict.
      end_points = utils.convert_collection_to_dict(end_points_collection)
      if spatial_squeeze:
        net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
      return net, end_points
Example #8
def squeezenet(images,
               num_classes=1000,
               is_training=False,
               scope='squeezenet'):
    """Original squeezenet architecture for 227x227 images."""

    #DEBUG
    print('squeezenet: is_training is %d' % is_training)
    with tf.variable_scope(scope, 'squeezenet', values=[images]) as sc:
        end_point_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope(
            [fire_module, myconv2d, slim.max_pool2d, slim.avg_pool2d],
                outputs_collections=[end_point_collection]):
            net = myconv2d(images,
                           64, [3, 3],
                           stride=2,
                           padding='VALID',
                           scope='conv1')
            net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool1')
            net = fire_module(net, 16, 64, scope='fire2')
            net = fire_module(net, 16, 64, scope='fire3')
            net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool3')
            net = fire_module(net, 32, 128, scope='fire4')
            net = fire_module(net, 32, 128, scope='fire5')
            net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool5')
            net = fire_module(net, 48, 192, scope='fire6')
            net = fire_module(net, 48, 192, scope='fire7')
            net = fire_module(net, 64, 256, scope='fire8')
            net = fire_module(net, 64, 256, scope='fire9')
            net = slim.dropout(net,
                               keep_prob=0.5,
                               is_training=is_training,
                               scope='drop9')
            net = myconv2d(net,
                           num_classes, [1, 1],
                           stride=1,
                           padding='VALID',
                           scope='conv10')
            net = slim.avg_pool2d(net, [13, 13],
                                  stride=1,
                                  padding='VALID',
                                  scope='avgpool10')
            logits = tf.squeeze(net, [1, 2], name='logits')
            logits = utils.collect_named_outputs(end_point_collection,
                                                 sc.name + '/logits', logits)
        end_points = utils.convert_collection_to_dict(end_point_collection)
        return logits, end_points
  def body(self, features):
    hp = self.hparams
    # pylint: disable=eval-used
    if hp.image_input_type == "image":
      image_feat = vqa_layers.image_embedding(
          features["inputs"],
          model_fn=eval(hp.image_model_fn),
          trainable=hp.train_resnet,
          is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
    else:
      image_feat = features["inputs"]

    if hp.image_feat_size:
      image_feat = common_layers.dense(image_feat, hp.image_feat_size)

    # apply layer normalization and dropout on image_feature
    utils.collect_named_outputs("norms", "image_feat_before_l2",
                                tf.norm(image_feat, axis=-1))
    image_feat = common_layers.l2_norm(image_feat)
    utils.collect_named_outputs("norms", "image_feat_after_l2",
                                tf.norm(image_feat, axis=-1))

    image_feat = tf.nn.dropout(image_feat, keep_prob=1.-hp.dropout)

    query = question_encoder(features["question"], hp)
    utils.collect_named_outputs("norms", "query",
                                tf.norm(query, axis=-1))

    image_ave = attn(image_feat, query, hp)
    utils.collect_named_outputs("norms", "image_ave",
                                tf.norm(image_ave, axis=-1))

    image_question = tf.concat([image_ave, query], axis=1)
    utils.collect_named_outputs("norms", "image_question",
                                tf.norm(image_question, axis=-1))

    image_question = tf.nn.dropout(image_question, 1. - hp.dropout)

    output = mlp(image_question, hp)
    utils.collect_named_outputs("norms", "output",
                                tf.norm(output, axis=-1))

    norm_tensors = utils.convert_collection_to_dict("norms")
    vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

    # Expand dimension 1 and 2
    return tf.expand_dims(tf.expand_dims(output, axis=1), axis=2)
Example #11
    def build_net(self, inputs, is_training):
        """
        Net structure described in crnn paper
        feature_maps = [64, 128, 256, 256, 512, 512, 512]
        """
        norm_params = {
            'is_training': is_training,
            'decay': 0.9,
            'epsilon': 1e-05
        }

        with tf.variable_scope(self._scope, self._scope, [inputs]) as sc:
            end_points_collection = sc.name + '_end_points'

            with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.batch_norm],
                                outputs_collections=end_points_collection):
                # net = slim.conv2d(inputs, 64, 3, 1, scope='conv1')
                net = slim.conv2d(inputs, 32, 1, 1, scope='conv1')
                net = dwise_conv(net, k_h=3, k_w=3, padding='SAME', name='dwise1')

                net = slim.max_pool2d(net, 2, 2, scope='pool1')

                # net = slim.conv2d(net, 128, 3, 1, scope='conv2')
                net = slim.conv2d(net, 64, 1, 1, scope='conv2')
                net = dwise_conv(net, k_h=3, k_w=3, padding='SAME', name='dwise2')

                net = slim.max_pool2d(net, 2, 2, scope='pool2')

                # net = slim.conv2d(net, 256, 3, scope='conv3')
                net = slim.conv2d(net, 128, 1, 1, scope='conv3')
                net = dwise_conv(net, k_h=3, k_w=3, padding='SAME', name='dwise3')

                # net = slim.conv2d(net, 256, 3, scope='conv4')
                net = slim.conv2d(net, 128, 1, 1, scope='conv4')
                net = dwise_conv(net, k_h=3, k_w=3, padding='SAME', name='dwise4')

                net = slim.max_pool2d(net, 2, [2, 1], scope='pool3')

                net = slim.conv2d(net, 256, 3, normalizer_fn=slim.batch_norm, normalizer_params=norm_params,
                                  scope='conv5')    # 512
                net = slim.conv2d(net, 256, 3, normalizer_fn=slim.batch_norm, normalizer_params=norm_params,
                                  scope='conv6')
                net = slim.max_pool2d(net, 2, [2, 1], scope='pool4')
                net = slim.conv2d(net, 256, 2, padding='VALID', scope='conv7')

            self.end_points = utils.convert_collection_to_dict(end_points_collection)
            self.net = net
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              global_pool=True,
              include_root_block=True,
              reuse=None,
              scope=None):
    # tf.variable_scope(scope, default_name, values)
    with tf.variable_scope(scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
        # Name of the collection that gathers the end points.
        end_points_collection = sc.original_name_scope + '_end_points'
        # Route the outputs of these ops into the end_points collection.
        with slim.arg_scope([slim.conv2d, bottleneck, stack_blocks_dense],
                            outputs_collections=end_points_collection):
            net = inputs
            if include_root_block:
                with slim.arg_scope([slim.conv2d],
                                    activation_fn=None,
                                    normalizer_fn=None):
                    net = conv2d_same(net, 64, 7, stride=2, scope='conv1')
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')

            net = stack_blocks_dense(net, blocks)
            net = slim.batch_norm(net,
                                  activation_fn=tf.nn.relu,
                                  scope='postnorm')
            if global_pool:
                # Global average pooling via reduce_mean over axes [1, 2]:
                # [batch_size, height, width, channels] -> [batch_size, 1, 1, channels]
                net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
                # A 1x1 convolution below replaces the fully connected layer.
            if num_classes is not None:
                # conv2d(net, num_classes, [1, 1]) with no activation or normalizer:
                # a 1x1 conv in place of a fully connected layer.
                net = slim.conv2d(net,
                                  num_classes, [1, 1],
                                  activation_fn=None,
                                  normalizer_fn=None,
                                  scope='logits')

            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            if num_classes is not None:
                end_points['predictions'] = slim.softmax(net,
                                                         scope='predictions')
            return net, end_points
Example #13
def pixelwise_predictor(feat,
                        nc=3,
                        n_layers=1,
                        n_layerwise_steps=0,
                        skip_feat=None,
                        reuse=False,
                        is_training=True):
    """Predicts texture images and probilistic masks.

  Args:
    feat: B X H X W X C feature vectors
    nc: number of output channels
    n_layers: number of plane equations to predict (denoted as L)
    n_layerwise_steps: Number of independent per-layer up-conv steps
    skip_feat: List of features useful for skip connections. Used if lws>0.
    reuse: Whether to reuse weights from an already defined net
    is_training: whether batch_norm should be in train mode
  Returns:
    textures : L X B X H X W X nc.
  """
    with tf.variable_scope('pixelwise_pred', reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d],
                            normalizer_fn=None,
                            weights_regularizer=slim.l2_regularizer(0.05),
                            activation_fn=tf.nn.sigmoid,
                            outputs_collections=end_points_collection):
            preds = []
            for l in range(n_layers):
                with tf.variable_scope('upsample_' + str(l), reuse=reuse):
                    feat_l, _ = decoder_simple(feat,
                                               nconv=n_layerwise_steps,
                                               skip_feat=skip_feat,
                                               reuse=reuse,
                                               is_training=is_training)
                    pred = slim.conv2d(feat_l,
                                       nc, [3, 3],
                                       stride=1,
                                       scope='pred_' + str(l))
                    preds.append(pred)

            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            preds = tf.stack(preds, axis=0)

            return preds, end_points
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              global_pool=True,
              include_root_block=True,
              reuse=None,
              scope=None):
    with tf.variable_scope(scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d, bottleneck, stack_blocks_dense],
                            outputs_collections=end_points_collection):

            net = inputs
            if include_root_block:  # If set, build the initial 7x7, stride-2 conv with 64 output channels, followed by max pooling
                with slim.arg_scope([slim.conv2d],
                                    activation_fn=None,
                                    normalizer_fn=None):
                    net = conv2d_same(net, 64, 7, stride=2, scope='conv1')
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')
                # After two stride-2 layers the spatial size is reduced to 1/4

            net = stack_blocks_dense(net, blocks)  # Build the groups of residual units
            net = slim.batch_norm(net,
                                  activation_fn=tf.nn.relu,
                                  scope='postnorm')

            if global_pool:
                net = tf.reduce_mean(
                    net, [1, 2], name='pool5',
                    keep_dims=True)  # global average pooling; tf.reduce_mean is more efficient than avg_pool

            if num_classes is not None:  # only add the logits layer when a class count is given
                net = slim.conv2d(
                    net,
                    num_classes,
                    [1, 1],
                    activation_fn=None,  # no activation or normalizer
                    normalizer_fn=None,
                    scope='logits')  # a 1x1 conv with num_classes output channels
            end_points = utils.convert_collection_to_dict(
                end_points_collection)  # convert the collection into a Python dict

            if num_classes is not None:
                end_points['predictions'] = slim.softmax(
                    net, scope='predictions')  # final network predictions
            return net, end_points
Example #15
    def get_outputs(self, blobs, output_layers, sess, collect_metadata=True):
        feed_dict = {
            self._image: blobs['data'],
            self._im_info: blobs['im_info'],
            self._gt_boxes: np.zeros((10, 5))
        }
        fetches = {}

        for collection_name in ops.get_all_collection_keys():
            if self._resnet_scope in collection_name:
                collection_dict = utils.convert_collection_to_dict(
                    collection_name)
                for alias, tensor in collection_dict.items():
                    alias = remove_net_suffix(alias, self._resnet_scope)
                    for output_layer in output_layers:
                        if output_layer.net_layer(self._resnet_scope) in alias:
                            fetches[output_layer] = tensor

        # with timer('get_outputs sess.run'):
        # Run the graph with the full trace option.
        if collect_metadata:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
            outputs = sess.run(fetches,
                               feed_dict=feed_dict,
                               options=run_options,
                               run_metadata=run_metadata)
        else:
            run_metadata = None
            outputs = sess.run(fetches, feed_dict=feed_dict)

        # Create the Timeline object, and write it to a json:
        # tl = timeline.Timeline(run_metadata.step_stats)
        # ctf = tl.generate_chrome_trace_format()
        # with open('timeline.json', 'w') as f:
        #   f.write(ctf)

        # outdir = osp.abspath(osp.join(cfg.ROOT_DIR, 'graph_defs'))
        # writer = tf.summary.FileWriter(logdir=outdir, graph=sess.graph)
        # writer.add_run_metadata(run_metadata, 'step1')
        # writer.flush()

        return outputs, run_metadata
Example #16
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              global_pool=True,
              include_root_block=True,
              reuse=None,
              scope=None):
    with tf.variable_scope(scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
        end_point_collections = sc.original_name_scope + '_end_points'
        # Use slim.arg_scope to set outputs_collections to end_point_collections for slim.conv2d, bottleneck and stack_block_dense
        with slim.arg_scope([slim.conv2d, bottleneck, stack_block_dense],
                            outputs_collections=end_point_collections):
            net = inputs
            if include_root_block:
                # If include_root_block is set, build ResNet's initial convolutional layer
                with slim.arg_scope([slim.conv2d],
                                    activation_fn=None,
                                    normalizer_fn=None):
                    net = conv2d_same(net, 64, 7, stride=2, scope='conv1')
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1')
            # Build the residual unit groups with stack_block_dense
            net = stack_block_dense(net, blocks)
            net = slim.batch_norm(net,
                                  activation_fn=tf.nn.relu,
                                  scope='postnorm')
            if global_pool:
                # Global average pooling, if requested
                net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
            if num_classes is not None:
                # If a class count is given, add a 1x1 conv with num_classes output channels
                net = slim.conv2d(net,
                                  num_classes, [1, 1],
                                  activation_fn=None,
                                  normalizer_fn=None,
                                  scope='logits')
            # utils.convert_collection_to_dict turns the collection into a dict
            end_points = utils.convert_collection_to_dict(
                end_point_collections)
            if num_classes is not None:
                # Add a softmax output layer
                end_points['prediction'] = slim.softmax(net,
                                                        scope='prediction')
            return net, end_points
Example #17
def encoder_simple(inp_img, nz=1000, is_training=True, reuse=False):
    """Creates a simple encoder CNN.

  Args:
    inp_img: TensorFlow node for input with size B X H X W X C
    nz: number of units in last layer, default=1000
    is_training: whether batch_norm should be in train mode
    reuse: Whether to reuse weights from an already defined net
  Returns:
    An encoder CNN which computes a final representation with nz
    units.
  """
    batch_norm_params = {'is_training': is_training}
    with tf.variable_scope('encoder', reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d, slim.fully_connected],
                            normalizer_fn=slim.batch_norm,
                            normalizer_params=batch_norm_params,
                            weights_regularizer=slim.l2_regularizer(0.05),
                            activation_fn=tf.nn.relu,
                            outputs_collections=end_points_collection):
            cnv1 = slim.conv2d(inp_img, 32, [7, 7], stride=2, scope='cnv1')
            cnv1b = slim.conv2d(cnv1, 32, [7, 7], stride=1, scope='cnv1b')
            cnv2 = slim.conv2d(cnv1b, 64, [5, 5], stride=2, scope='cnv2')
            cnv2b = slim.conv2d(cnv2, 64, [5, 5], stride=1, scope='cnv2b')
            cnv3 = slim.conv2d(cnv2b, 128, [3, 3], stride=2, scope='cnv3')
            cnv3b = slim.conv2d(cnv3, 128, [3, 3], stride=1, scope='cnv3b')
            cnv4 = slim.conv2d(cnv3b, 256, [3, 3], stride=2, scope='cnv4')
            cnv4b = slim.conv2d(cnv4, 256, [3, 3], stride=1, scope='cnv4b')
            cnv5 = slim.conv2d(cnv4b, 512, [3, 3], stride=2, scope='cnv5')
            cnv5b = slim.conv2d(cnv5, 512, [3, 3], stride=1, scope='cnv5b')
            cnv6 = slim.conv2d(cnv5b, 512, [3, 3], stride=2, scope='cnv6')
            cnv6b = slim.conv2d(cnv6, 512, [3, 3], stride=1, scope='cnv6b')
            cnv7 = slim.conv2d(cnv6b, 512, [3, 3], stride=2, scope='cnv7')
            cnv7b = slim.conv2d(cnv7, 512, [3, 3], stride=1, scope='cnv7b')
            cnv7b_flat = slim.flatten(cnv7b, scope='cnv7b_flat')
            enc = slim.stack(cnv7b_flat,
                             slim.fully_connected, [2 * nz, nz, nz],
                             scope='fc')

        end_points = utils.convert_collection_to_dict(end_points_collection)
        return enc, end_points
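A hedged usage sketch for encoder_simple (illustrative only; it assumes TF 1.x, the
slim = tf.contrib.slim alias the function body already relies on, and an input whose
spatial size survives the seven stride-2 convolutions):

import tensorflow as tf

imgs = tf.placeholder(tf.float32, [8, 128, 128, 3])
enc, end_points = encoder_simple(imgs, nz=1000, is_training=False)
# enc is an [8, 1000] encoding; end_points maps the collected scope aliases
# (cnv1 ... cnv7b and the fully connected stack) to intermediate activations.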
Example #18
def vgg16(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.5,
          spatial_squeeze=True,
          reuse=tf.AUTO_REUSE,
          scope='vgg_16'):
    with variable_scope.variable_scope(scope, 'vgg_16', [inputs],
                                       reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with arg_scope(
            [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d],
                outputs_collections=end_points_collection):
            net = layers_lib.repeat(inputs,
                                    2,
                                    layers.conv2d,
                                    64, [3, 3],
                                    scope='conv1')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool1')
            net = layers_lib.repeat(net,
                                    2,
                                    layers.conv2d,
                                    128, [3, 3],
                                    scope='conv2')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool2')
            net = layers_lib.repeat(net,
                                    3,
                                    layers.conv2d,
                                    256, [3, 3],
                                    scope='conv3')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool3')
            net = layers_lib.repeat(net,
                                    3,
                                    layers.conv2d,
                                    512, [3, 3],
                                    scope='conv4')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool4')

        # Convert end_points_collection into an end_point dict.
        end_points = utils.convert_collection_to_dict(end_points_collection)
        return net, end_points
Example #19
    def build_net(self, inputs):
        with tf.variable_scope(self._scope, self._scope, [inputs]) as sc:
            end_points_collection = sc.name + '_end_points'

            with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.batch_norm],
                                outputs_collections=end_points_collection):
                net = slim.conv2d(inputs, 96, [3, 3], scope='conv1')
                net = slim.max_pool2d(net, [2, 2], stride=2, scope='maxpool1')
                net = self.fire_module(net, 16, 64, scope='fire2')
                net = self.fire_module(net, 16, 64, scope='fire3')
                net = self.fire_module(net, 32, 128, scope='fire4')
                net = slim.max_pool2d(net, [2, 2], stride=2, scope='maxpool4')
                net = self.fire_module(net, 32, 128, scope='fire5')
                net = self.fire_module(net, 48, 192, scope='fire6')
                net = self.fire_module(net, 48, 192, scope='fire7')
                net = self.fire_module(net, 64, 256, scope='fire8')
                net = slim.max_pool2d(net, [2, 2], stride=[2, 1], scope='maxpool8')
                net = self.fire_module(net, 64, 256, scope='fire9')

        self.end_points = utils.convert_collection_to_dict(end_points_collection)
        self.net = net
def inference(inputs, batch_size, num_classes, training=True):
    with tf.variable_scope('inference') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d],
                            normalizer_fn=None,
                            weights_regularizer=slim.l2_regularizer(0.05),
                            activation_fn=tf.nn.relu,
                            trainable=training,
                            outputs_collections=end_points_collection):
            cnv1 = slim.conv2d(inputs, 16, [3, 3], stride=1, scope='cnv1')
            cnv2 = slim.conv2d(cnv1, 32, [1, 1], stride=1, scope='cnv2')
            max_pool1 = slim.max_pool2d(cnv2, [3, 3], stride=2, scope='maxpool1')
            cnv3 = slim.conv2d(max_pool1, 32, [3, 3], stride=1, scope='cnv3')
            cnv4 = slim.conv2d(cnv3, 64, [1, 1], stride=1, scope='cnv4')
            max_pool2 = slim.max_pool2d(cnv4, [3, 3], stride=2, scope='maxpool2')
            flat = slim.flatten(max_pool2, scope='flatten')
            fc_1 = slim.fully_connected(flat, 128, scope='fc_1', trainable=training)
            drop1 = slim.dropout(fc_1, scope='drop1', is_training=training)
            fc_2 = slim.fully_connected(drop1, num_classes, scope='fc_2', trainable=training)
            end_points = utils.convert_collection_to_dict(end_points_collection)
            return fc_2, end_points
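A quick hedged sketch of calling inference above (assumes TF 1.x and the
slim = tf.contrib.slim alias used in the function; the 28x28 input size is an
arbitrary illustration):

import tensorflow as tf

x = tf.placeholder(tf.float32, [32, 28, 28, 1])
logits, end_points = inference(x, batch_size=32, num_classes=10, training=False)
# logits has shape [32, 10]; end_points maps the collected conv aliases to activations.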
def resnet_v1(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              reuse=None,
              scope=None):
  with variable_scope.variable_scope(
      scope, 'resnet_v1', [inputs], reuse=reuse) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    with arg_scope(
        [layers.conv2d, not_bottleneck, resnet_utils.stack_blocks_dense],
        outputs_collections=end_points_collection):
      with arg_scope([layers.batch_norm], is_training=is_training):
        net = inputs
        if include_root_block:
          if output_stride is not None:
            if output_stride % 4 != 0:
              raise ValueError('The output_stride needs to be a multiple of 4.')
            output_stride /= 4
          net = resnet_utils.conv2d_same(net, 16, 3, stride=1, scope='conv1')
        net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
        if global_pool:
          # Global average pooling.
          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
        if num_classes is not None:
          net = layers.conv2d(
              net,
              num_classes, [1, 1],
              activation_fn=None,
              normalizer_fn=None,
              scope='logits')
        # Convert end_points_collection into a dictionary of end_points.
        end_points = utils.convert_collection_to_dict(end_points_collection)
        if num_classes is not None:
          end_points['predictions'] = layers_lib.softmax(
              net, scope='predictions')
        return net, end_points
Example #22
def squeezenet(images,
               is_training=True,
               batch_norm_decay=0.999,
               num_classes=1000):
    """Original squeezenet architecture for 224x224 images."""
    with slim.arg_scope(squeezenet_arg_scope(is_training, batch_norm_decay)):
        with tf.variable_scope('squeezenet', values=[images]) as sc:
            end_point_collection = sc.original_name_scope + '_end_points'
            with slim.arg_scope(
                [fire_module, slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                    outputs_collections=[end_point_collection]):
                net = slim.conv2d(images, 96, [7, 7], stride=2, scope='conv1')
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool1')
                net = fire_module(net, 16, 64, scope='fire2')
                net = fire_module(net, 16, 64, scope='fire3')
                net = fire_module(net, 32, 128, scope='fire4')
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool4')
                net = fire_module(net, 32, 128, scope='fire5')
                net = fire_module(net, 48, 192, scope='fire6')
                net = fire_module(net, 48, 192, scope='fire7')
                net = fire_module(net, 64, 256, scope='fire8')
                net = slim.max_pool2d(net, [3, 3], stride=2, scope='maxpool8')
                net = fire_module(net, 64, 256, scope='fire9')
                net = slim.dropout(net, is_training=is_training, scope='drop9')
                net = slim.conv2d(net,
                                  num_classes, [1, 1],
                                  stride=1,
                                  scope='conv10')
                net = slim.avg_pool2d(net, [13, 13],
                                      stride=1,
                                      scope='avgpool10')
                logits = tf.squeeze(net, [1, 2], name='logits')
                logits = utils.collect_named_outputs(end_point_collection,
                                                     sc.name + '/logits',
                                                     logits)
            end_points = utils.convert_collection_to_dict(end_point_collection)
            return logits, end_points
Example #23
def pose_net_fb(tgt_image, src_image_stack, is_training=True, reuse=False):
    inputs = tf.concat([tgt_image, src_image_stack], axis=3)
    H = inputs.get_shape()[1].value
    W = inputs.get_shape()[2].value
    num_source = int(src_image_stack.get_shape()[3].value // 3)
    with tf.variable_scope('pose_net') as sc:
        if reuse:
            sc.reuse_variables()
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                            normalizer_fn=None,
                            weights_regularizer=slim.l2_regularizer(0.05),
                            activation_fn=tf.nn.relu,
                            outputs_collections=end_points_collection):
            # cnv1 to cnv5b are shared between pose and explainability prediction
            cnv1 = slim.conv2d(inputs, 16, [7, 7], stride=2, scope='cnv1')
            cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
            cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
            cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
            cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')
            cnv6 = slim.conv2d(cnv5, 256, [3, 3], stride=2, scope='cnv6')
            cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
            # Double the number of channels
            pose_pred = slim.conv2d(cnv7,
                                    6 * num_source * 2, [1, 1],
                                    scope='pred',
                                    stride=1,
                                    normalizer_fn=None,
                                    activation_fn=None)
            pose_avg = tf.reduce_mean(pose_pred, [1, 2])
            # Empirically we found that scaling by a small constant
            # facilitates training.
            # 1st half: target->source, 2nd half: source->target
            pose_final = 0.01 * tf.reshape(pose_avg, [-1, num_source, 6 * 2])
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            return pose_final, end_points
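A hedged usage sketch for pose_net_fb (illustrative only; assumes TF 1.x, the
slim = tf.contrib.slim alias used above, and two source views stacked along the
channel axis):

import tensorflow as tf

tgt = tf.placeholder(tf.float32, [4, 128, 416, 3])
src_stack = tf.placeholder(tf.float32, [4, 128, 416, 3 * 2])  # two source views
pose_final, end_points = pose_net_fb(tgt, src_stack, is_training=True)
# pose_final has shape [4, 2, 12]: per source view, the first 6 values are the
# target->source pose and the last 6 the source->target pose (scaled by 0.01).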
Example #24
def cifar_squeezenet(images,
                     is_training=True,
                     batch_norm_decay=0.999,
                     num_classes=10):
    """Modified version of squeezenet for CIFAR images"""
    with slim.arg_scope(squeezenet_arg_scope(is_training, batch_norm_decay)):
        with tf.variable_scope('squeezenet', values=[images]) as sc:
            end_point_collection = sc.original_name_scope + '_end_points'
            with slim.arg_scope(
                [fire_module, slim.conv2d, slim.max_pool2d, slim.avg_pool2d],
                    outputs_collections=[end_point_collection]):
                net = slim.conv2d(images, 96, [2, 2], scope='conv1')
                net = slim.max_pool2d(net, [2, 2], scope='maxpool1')
                net = fire_module(net, 16, 64, scope='fire2')
                net = fire_module(net, 16, 64, scope='fire3')
                net = fire_module(net, 32, 128, scope='fire4')
                net = slim.max_pool2d(net, [2, 2], scope='maxpool4')
                net = fire_module(net, 32, 128, scope='fire5')
                net = fire_module(net, 48, 192, scope='fire6')
                net = fire_module(net, 48, 192, scope='fire7')
                net = fire_module(net, 64, 256, scope='fire8')
                net = slim.max_pool2d(net, [2, 2], scope='maxpool8')
                net = fire_module(net, 64, 256, scope='fire9')
                # Use global average pooling per 'Network in Network [1]'
                net = slim.avg_pool2d(net, [4, 4], scope='avgpool10')
                net = slim.conv2d(net,
                                  num_classes, [1, 1],
                                  activation_fn=None,
                                  normalizer_fn=None,
                                  scope='conv10')
                logits = tf.squeeze(net, [1, 2], name='logits')
                logits = utils.collect_named_outputs(end_point_collection,
                                                     sc.name + '/logits',
                                                     logits)
            end_points = utils.convert_collection_to_dict(end_point_collection)
    return logits, end_points
Example #25
def get_slim_arch_bn(inputs, isTrainTensor, num_classes=1000, scope='vgg_16'):
    with variable_scope.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.

        filters = 64

        # Arg scope set default parameters for a list of ops
        with arg_scope(
            [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d],
                outputs_collections=end_points_collection):
            net = layers_lib.repeat(
                inputs,
                2,
                layers.conv2d,
                filters, [3, 3],
                scope='conv1',
                weights_regularizer=slim.l2_regularizer(0.01))
            bn_0 = tf.contrib.layers.batch_norm(net,
                                                center=True,
                                                scale=True,
                                                is_training=isTrainTensor,
                                                scope='bn1',
                                                decay=0.9)
            p_0 = layers_lib.max_pool2d(bn_0, [2, 2], scope='pool1')

            net = layers_lib.repeat(
                p_0,
                2,
                layers.conv2d,
                filters, [3, 3],
                scope='conv2',
                weights_regularizer=slim.l2_regularizer(0.01))
            bn_1 = tf.contrib.layers.batch_norm(net,
                                                center=True,
                                                scale=True,
                                                is_training=isTrainTensor,
                                                scope='bn2',
                                                decay=0.9)
            res_1 = p_0 + bn_1
            p_1 = layers_lib.max_pool2d(res_1, [2, 2], scope='pool2')

            net = layers_lib.repeat(
                p_1,
                3,
                layers.conv2d,
                filters, [4, 4],
                scope='conv3',
                weights_regularizer=slim.l2_regularizer(0.01))
            bn_2 = tf.contrib.layers.batch_norm(net,
                                                center=True,
                                                scale=True,
                                                is_training=isTrainTensor,
                                                scope='bn3',
                                                decay=0.9)
            res_2 = p_1 + bn_2
            p_2 = layers_lib.max_pool2d(res_2, [2, 2], scope='pool3')

            net = layers_lib.repeat(
                p_2,
                3,
                layers.conv2d,
                filters, [5, 5],
                scope='conv4',
                weights_regularizer=slim.l2_regularizer(0.01))
            bn_3 = tf.contrib.layers.batch_norm(net,
                                                center=True,
                                                scale=True,
                                                is_training=isTrainTensor,
                                                scope='bn4',
                                                decay=0.9)
            res_3 = p_2 + bn_3
            p_3 = layers_lib.max_pool2d(res_3, [2, 2], scope='pool4')

            last_conv = net = layers_lib.repeat(
                p_3,
                3,
                layers.conv2d,
                filters, [5, 5],
                scope='conv5',
                weights_regularizer=slim.l2_regularizer(0.01))

            # At this point the feature maps are 14x14 (for 224x224 inputs)
            net = tf.reduce_mean(net, [1, 2])  # Global average pooling

            # Add a float32 mask with the same shape as the global-average-pooled
            # features; it defaults to ones via placeholder_with_default and can be
            # overridden at feed time.

            mask = tf.placeholder_with_default(tf.ones_like(net),
                                               shape=net.shape,
                                               name='gap_mask')
            net = tf.multiply(net, mask)

            net = layers_lib.fully_connected(net,
                                             num_classes,
                                             activation_fn=None,
                                             biases_initializer=None,
                                             scope='softmax_logits')

            with tf.variable_scope("raw_CAM"):
                w_tensor_name = "vgg_16/softmax_logits/weights:0"
                s_w = tf.get_default_graph().get_tensor_by_name(w_tensor_name)
                softmax_weights = tf.expand_dims(tf.expand_dims(s_w, 0),
                                                 0)  # reshape to match 1x1xFxC
                # tensor mult from (N x lh x lw x F) , (1 x 1 x F x C)
                cam = tf.tensordot(last_conv,
                                   softmax_weights, [[3], [2]],
                                   name='cam_out')

            # Convert end_points_collection into an end_point dict.
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            return net, end_points
Example #26
def resnet_v2(
        inputs, blocks, num_classes=None, is_training=True, global_pool=True, output_stride=None,
        include_root_block=True, centered_stride=False, reuse=None, scope=None):
    """Generator for v2 (preactivation) ResNet models.

    This function generates a family of ResNet v2 models. See the resnet_v2_*()
    methods for specific model instantiations, obtained by selecting different
    block instantiations that produce ResNets of various depths.

    Training for image classification on Imagenet is usually done with [224, 224]
    inputs, resulting in [7, 7] feature maps at the output of the last ResNet
    block for the ResNets defined in [1] that have nominal stride equal to 32.
    However, for dense prediction tasks we advise that one uses inputs with
    spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
    this case the feature maps at the ResNet output will have spatial shape
    [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
    and corners exactly aligned with the input image corners, which greatly
    facilitates alignment of the features to the image. Using as input [225, 225]
    images results in [8, 8] feature maps at the output of the last ResNet block.

    For dense prediction tasks, the ResNet needs to run in fully-convolutional
    (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
    have nominal stride equal to 32 and a good choice in FCN mode is to use
    output_stride=16 in order to increase the density of the computed features at
    small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.

    Args:
      inputs: A tensor of size [batch, height_in, width_in, channels].
      blocks: A list of length equal to the number of ResNet blocks. Each element
        is a resnet_utils.Block object describing the units in the block.
      num_classes: Number of predicted classes for classification tasks. If None
        we return the features before the logit layer.
      is_training: whether batch_norm layers are in training mode.
      global_pool: If True, we perform global average pooling before computing the
        logits. Set to True for image classification, False for dense prediction.
      output_stride: If None, then the output will be computed at the nominal
        network stride. If output_stride is not None, it specifies the requested
        ratio of input to output spatial resolution.
      include_root_block: If True, include the initial convolution followed by
        max-pooling, if False excludes it. If excluded, `inputs` should be the
        results of an activation-less convolution.
      centered_stride: If True, the root max-pooling uses a centered stride (only
        takes effect for certain output_stride settings; see the call to
        resnet_utils.max_pool2d_same below).
      reuse: whether or not the network and its variables should be reused. To be
        able to reuse 'scope' must be given.
      scope: Optional variable_scope.


    Returns:
      net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
        If global_pool is False, then height_out and width_out are reduced by a
        factor of output_stride compared to the respective height_in and width_in,
        else both height_out and width_out equal one. If num_classes is None, then
        net is the output of the last ResNet block, potentially after global
        average pooling. If num_classes is not None, net contains the pre-softmax
        activations.
      end_points: A dictionary from components of the network to the corresponding
        activation.

    Raises:
      ValueError: If the target output_stride is not valid.
    """

    with variable_scope.variable_scope(
            scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with arg_scope(
                [layers_lib.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
                outputs_collections=end_points_collection):
            with arg_scope([layers.batch_norm], is_training=is_training):
                net = inputs
                if include_root_block:
                    if output_stride is not None:
                        if output_stride % 4 != 0:
                            raise ValueError('The output_stride needs to be a multiple of 4.')
                        output_stride /= 4
                    # We do not include batch normalization or activation functions in
                    # conv1 because the first ResNet unit will perform these. Cf.
                    # Appendix of [2].
                    with arg_scope([layers_lib.conv2d], activation_fn=None, normalizer_fn=None):
                        net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')

                    net = resnet_utils.max_pool2d_same(
                        net, 3, stride=2, scope='pool1',
                        centered_stride=centered_stride and output_stride == 4)
                net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
                # This is needed because the pre-activation variant does not have batch
                # normalization or activation functions in the residual unit output. See
                # Appendix of [2].
                net = slim.batch_norm(net, activation_fn=nn_ops.relu, scope='postnorm')
                if global_pool:
                    # Global average pooling.
                    net = math_ops.reduce_mean(net, tfu.image_axes(), name='pool5', keepdims=True)
                if num_classes is not None:
                    net = layers_lib.conv2d(
                        net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None,
                        scope='logits')
                # Convert end_points_collection into a dictionary of end_points.
                end_points = utils.convert_collection_to_dict(end_points_collection)
                if num_classes is not None:
                    end_points['predictions'] = layers.softmax(net, scope='predictions')
                return net, end_points
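As a hedged illustration of the FCN-mode call described in the docstring (not part
of the original snippet): the block list below assumes the resnet_utils.Block
namedtuple of (scope, unit_fn, args) with bottleneck args of the form
(depth, depth_bottleneck, stride); adapt it to however blocks are built in your code.

import tensorflow as tf

blocks = [
    resnet_utils.Block('block1', bottleneck,
                       [(256, 64, 1)] * 2 + [(256, 64, 2)]),
    resnet_utils.Block('block2', bottleneck,
                       [(512, 128, 1)] * 3 + [(512, 128, 2)]),
]

images = tf.placeholder(tf.float32, [1, 321, 321, 3])
net, end_points = resnet_v2(
    images, blocks,
    num_classes=None,    # return features rather than logits
    global_pool=False,   # keep spatial dimensions for dense prediction
    output_stride=16,    # denser output features, cf. the docstring above
    scope='resnet_v2_demo')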
Example #27
def vgg_16(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           scope='vgg_16'):
    """Oxford Net VGG 16-Layers version D Example.

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels], with pixel
      values in the original [0, 255] range (mean subtraction and scaling are
      applied inside the function).
    num_classes: number of predicted classes (unused in this truncated variant,
      since the fully connected layers are commented out).
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training (unused in this truncated variant).
    scope: Optional scope for the variables.

  Returns:
    the 'vgg_16/conv5/conv5_3' end point, i.e. the last convolutional feature map.
  """
    import tensorflow as tf
    # Preprocess: subtract the standard VGG channel means, then rescale.
    # Note that this variant divides by 255 after the mean subtraction.
    inputs -= tf.constant([123.68, 116.779, 103.939])
    inputs /= 255
    with variable_scope.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with arg_scope(
            [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d],
                outputs_collections=end_points_collection):
            net = layers_lib.repeat(inputs,
                                    2,
                                    layers.conv2d,
                                    64, [3, 3],
                                    scope='conv1',
                                    trainable=False)
            net = layers_lib.max_pool2d(net, [2, 2],
                                        scope='pool1',
                                        padding="SAME")
            net = layers_lib.repeat(net,
                                    2,
                                    layers.conv2d,
                                    128, [3, 3],
                                    scope='conv2',
                                    trainable=False)
            net = layers_lib.max_pool2d(net, [2, 2],
                                        scope='pool2',
                                        padding="SAME")
            net = layers_lib.repeat(net,
                                    3,
                                    layers.conv2d,
                                    256, [3, 3],
                                    scope='conv3')
            net = layers_lib.max_pool2d(net, [2, 2],
                                        scope='pool3',
                                        padding="SAME")
            net = layers_lib.repeat(net,
                                    3,
                                    layers.conv2d,
                                    512, [3, 3],
                                    scope='conv4')
            net = layers_lib.max_pool2d(net, [2, 2],
                                        scope='pool4',
                                        padding="SAME")
            net = layers_lib.repeat(net,
                                    3,
                                    layers.conv2d,
                                    512, [3, 3],
                                    scope='conv5')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool5')
            # Use conv2d instead of fully_connected layers.
            # net = layers.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6')
            # net = layers_lib.dropout(
            #     net, dropout_keep_prob, is_training=is_training, scope='dropout6')
            # net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
            # net = layers_lib.dropout(
            #     net, dropout_keep_prob, is_training=is_training, scope='dropout7')
            # net = layers.conv2d(
            #     net,
            #     num_classes, [1, 1],
            #     activation_fn=None,
            #     normalizer_fn=None,
            #     scope='fc8')
            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            return end_points["vgg_16/conv5/conv5_3"]
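A minimal usage sketch for the truncated vgg_16 above, assuming TF 1.x graph mode and that the module-level names used in the snippet (variable_scope, arg_scope, layers, layers_lib, utils) are already imported; the placeholder shape and the printed size are only illustrative.

import tensorflow as tf

# Hypothetical driver code, not part of the original example.
images = tf.placeholder(tf.float32, [None, 224, 224, 3], name='images')
conv5_3 = vgg_16(images, is_training=False)
# With 224x224 inputs and four stride-2 pools before conv5, this should be
# [None, 14, 14, 512].
print(conv5_3.get_shape().as_list())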
Exemple #28
0
def vgg_a(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.5,
          spatial_squeeze=True,
          scope='vgg_a'):
    """Oxford Net VGG 11-Layers version A Example.

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.

  Returns:
    the last op containing the log predictions and end_points dict.
  """
    with variable_scope.variable_scope(scope, 'vgg_a', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with arg_scope([layers.conv2d, layers_lib.max_pool2d],
                       outputs_collections=end_points_collection):
            net = layers_lib.repeat(inputs,
                                    1,
                                    layers.conv2d,
                                    64, [3, 3],
                                    scope='conv1',
                                    trainable=False)
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool1')
            net = layers_lib.repeat(net,
                                    1,
                                    layers.conv2d,
                                    128, [3, 3],
                                    scope='conv2',
                                    trainable=False)
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool2')
            net = layers_lib.repeat(net,
                                    2,
                                    layers.conv2d,
                                    256, [3, 3],
                                    scope='conv3')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool3')
            net = layers_lib.repeat(net,
                                    2,
                                    layers.conv2d,
                                    512, [3, 3],
                                    scope='conv4')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool4')
            net = layers_lib.repeat(net,
                                    2,
                                    layers.conv2d,
                                    512, [3, 3],
                                    scope='conv5')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool5')
            # Use conv2d instead of fully_connected layers.
            net = layers.conv2d(net,
                                4096, [7, 7],
                                padding='VALID',
                                scope='fc6')
            net = layers_lib.dropout(net,
                                     dropout_keep_prob,
                                     is_training=is_training,
                                     scope='dropout6')
            net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
            net = layers_lib.dropout(net,
                                     dropout_keep_prob,
                                     is_training=is_training,
                                     scope='dropout7')
            net = layers.conv2d(net,
                                num_classes, [1, 1],
                                activation_fn=None,
                                normalizer_fn=None,
                                scope='fc8')
            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            if spatial_squeeze:
                net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed')
                end_points[sc.name + '/fc8'] = net
            return net, end_points
Exemple #29
0
    def _build(self, inputs, is_training=True):
        """
        Args:
            inputs: A Tensor of shape `(batch_size, height, width, channels)`.

        Returns:
            A dict of feature maps to be consumed by an SSD network
        """
        # TODO: Is there a better way to manage scoping in these cases?
        scope = self.module_name
        if self.parent_name:
            scope = self.parent_name + "/" + scope

        base_net_endpoints = super(SSDFeatureExtractor, self)._build(
            inputs, is_training=is_training)["end_points"]

        if self.truncated_vgg_16_type:
            # As it is pointed out in SSD and ParseNet papers, `conv4_3` has a
            # different features scale compared to other layers, to adjust it
            # we need to add a spatial normalization before adding the
            # predictors.
            vgg_conv4_3 = base_net_endpoints[scope + "/vgg_16/conv4/conv4_3"]
            tf.summary.histogram("conv4_3_hist", vgg_conv4_3)
            with tf.variable_scope("conv_4_3_norm"):
                # Normalize through channels dimension (dim=3)
                vgg_conv4_3_norm = tf.nn.l2_normalize(vgg_conv4_3,
                                                      3,
                                                      epsilon=1e-12)
                # Scale.
                scale_initializer = (tf.ones([1, 1, 1, vgg_conv4_3.shape[3]]) *
                                     20.0)  # They initialize to 20.0 in paper
                scale = tf.get_variable(
                    "gamma",
                    dtype=vgg_conv4_3.dtype.base_dtype,
                    initializer=scale_initializer,
                )
                vgg_conv4_3_norm = tf.multiply(vgg_conv4_3_norm, scale)
                tf.summary.histogram("conv4_3_normalized_hist", vgg_conv4_3)
            tf.add_to_collection("FEATURE_MAPS", vgg_conv4_3_norm)

            # The original SSD paper uses a modified version of the vgg16
            # network, which we'll modify here
            vgg_network_truncation_endpoint = base_net_endpoints[
                scope + "/vgg_16/conv5/conv5_3"]
            tf.summary.histogram("conv5_3_hist",
                                 vgg_network_truncation_endpoint)

            # Extra layers for vgg16 as detailed in paper
            with tf.variable_scope("extra_feature_layers"):
                self._init_vgg16_extra_layers()
                net = tf.nn.max_pool(
                    vgg_network_truncation_endpoint,
                    [1, 3, 3, 1],
                    padding="SAME",
                    strides=[1, 1, 1, 1],
                    name="pool5",
                )
                net = self.conv6(net)
                net = self.activation_fn(net)
                net = self.conv7(net)
                net = self.activation_fn(net)
                tf.summary.histogram("conv7_hist", net)
                tf.add_to_collection("FEATURE_MAPS", net)
                net = self.conv8_1(net)
                net = self.activation_fn(net)
                net = self.conv8_2(net)
                net = self.activation_fn(net)
                tf.summary.histogram("conv8_hist", net)
                tf.add_to_collection("FEATURE_MAPS", net)
                net = self.conv9_1(net)
                net = self.activation_fn(net)
                net = self.conv9_2(net)
                net = self.activation_fn(net)
                tf.summary.histogram("conv9_hist", net)
                tf.add_to_collection("FEATURE_MAPS", net)
                net = self.conv10_1(net)
                net = self.activation_fn(net)
                net = self.conv10_2(net)
                net = self.activation_fn(net)
                tf.summary.histogram("conv10_hist", net)
                tf.add_to_collection("FEATURE_MAPS", net)
                net = self.conv11_1(net)
                net = self.activation_fn(net)
                net = self.conv11_2(net)
                net = self.activation_fn(net)
                tf.summary.histogram("conv11_hist", net)
                tf.add_to_collection("FEATURE_MAPS", net)

            # This parameter determines onto which variables we try to load the
            # pretrained weights
            self.pretrained_weights_scope = scope + "/vgg_16"

        # It's actually an ordered dict
        return utils.convert_collection_to_dict("FEATURE_MAPS")
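The conv4_3 branch above L2-normalizes the feature map along the channel dimension and multiplies it by a learned per-channel scale initialized to 20.0, following the ParseNet/SSD papers. A self-contained sketch of that operation, with an assumed helper name and TF 1.x APIs, for reference:

import tensorflow as tf

def l2_normalize_with_scale(x, init_scale=20.0, name="l2_norm_scale"):
    """L2-normalize `x` along channels, then apply a learned per-channel scale."""
    with tf.variable_scope(name):
        x_norm = tf.nn.l2_normalize(x, 3, epsilon=1e-12)
        gamma = tf.get_variable(
            "gamma",
            shape=[1, 1, 1, x.shape[3]],
            dtype=x.dtype.base_dtype,
            initializer=tf.constant_initializer(init_scale))
        return x_norm * gamma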
  def body(self, features):
    hp = self.hparams
    # pylint: disable=eval-used
    if hp.image_input_type == "image":
      image_feat = vqa_layers.image_embedding(
          features["inputs"],
          model_fn=eval(hp.image_model_fn),
          trainable=hp.train_resnet,
          is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
    else:
      image_feat = features["inputs"]

    image_feat = common_layers.flatten4d3d(image_feat)
    image_hidden_size = hp.image_hidden_size or hp.hidden_size
    if hp.image_feat_preprocess_proj:
      image_feat = common_layers.dense(image_feat, image_hidden_size)
      utils.collect_named_outputs("norms", "image_feat_after_proj",
                                  tf.norm(image_feat, axis=-1))
    else:
      assert image_hidden_size == 2048

    image_feat = tf.nn.dropout(
        image_feat, keep_prob=1.-hp.layer_prepostprocess_dropout)

    if hp.image_feat_encode:
      image_feat = image_encoder(image_feat, hp)
      utils.collect_named_outputs("norms", "image_feat_encoded",
                                  tf.norm(image_feat, axis=-1))
    else:
      image_feat = common_layers.layer_norm(image_feat)
      utils.collect_named_outputs("norms", "image_feat_after_layer",
                                  tf.norm(image_feat, axis=-1))

    question = common_layers.flatten4d3d(features["question"])
    utils.collect_named_outputs("norms", "question_embedding",
                                tf.norm(question, axis=-1))
    question, question_self_attention_bias = prepare_question_encoder(
        question, hp)
    question = tf.nn.dropout(
        question, keep_prob=1.-hp.layer_prepostprocess_dropout)
    query = question_encoder(question, question_self_attention_bias, hp)
    utils.collect_named_outputs(
        "norms", "query_encode", tf.norm(query, axis=-1))
    query = (query + tf.expand_dims(
        tf.squeeze(question_self_attention_bias, [1, 2]), axis=2))
    query = tf.reduce_max(query, axis=1)
    utils.collect_named_outputs(
        "norms", "query_maxpool", tf.norm(query, axis=-1))

    # query = common_layers.l2_norm(query)
    # utils.collect_named_outputs("norms", "query_after_l2",
    #                             tf.norm(query, axis=-1))

    image_ave = attn(image_feat, query, hp)
    utils.collect_named_outputs("norms", "image_ave",
                                tf.norm(image_ave, axis=-1))

    if hp.multimodal_combine == "concat":
      image_question = tf.concat([image_ave, query], axis=1)
    elif hp.multimodal_combine == "sum":
      image_question = image_ave + query
    elif hp.multimodal_combine == "product":
      image_question = image_ave * query

    utils.collect_named_outputs("norms", "image_question",
                                tf.norm(image_question, axis=-1))

    image_question = tf.nn.dropout(image_question, 1. - hp.dropout)

    output = mlp(image_question, hp)
    utils.collect_named_outputs("norms", "output",
                                tf.norm(output, axis=-1))

    norm_tensors = utils.convert_collection_to_dict("norms")
    vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

    # Expand dimension 1 and 2
    return tf.expand_dims(tf.expand_dims(output, axis=1), axis=2)
Exemple #31
0
def resnet_v1(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              include_root_block=True,
              reuse=None,
              scope=None,
              normalize_inside=True):
    """Removes output_stride, use pre-defined rate

    Returns:
      net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
        If global_pool is False, then height_out and width_out are reduced by a
        factor of output_stride compared to the respective height_in and width_in,
        else both height_out and width_out equal one. If num_classes is None, then
        net is the output of the last ResNet block, potentially after global
        average pooling. If num_classes is not None, net contains the pre-softmax
        activations.
      end_points: A dictionary from components of the network to the corresponding
        activation.

    Raises:
      ValueError: If the target output_stride is not valid.
    """
    if normalize_inside:
        # if no normalization is used outside, use detectron style normalization
        inputs = _detectron_img_preprocess(inputs)

    with variable_scope.variable_scope(scope,
                                       'resnet_v1', [inputs],
                                       reuse=reuse) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with arg_scope([conv2d, bottleneck, stack_blocks_dense, max_pool2d],
                       outputs_collections=end_points_collection):
            with arg_scope([batch_norm], is_training=is_training):
                net = inputs
                if include_root_block:
                    # net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
                    net = conv2d(net, 64, 7, 2, scope='conv1')
                    net = max_pool2d(net, 3, 2, scope='pool1')
                net = stack_blocks_dense(net, blocks)
                if global_pool:
                    # Global average pooling.
                    net = math_ops.reduce_mean(net, [1, 2],
                                               name='pool5',
                                               keepdims=True)
                    net = utils.collect_named_outputs(end_points_collection,
                                                      sc.name + '/gap', net)

                if num_classes is not None:
                    net = conv2d(net,
                                 num_classes,
                                 1,
                                 activation_fn=None,
                                 normalizer_fn=None,
                                 scope='logits')
                # Convert end_points_collection into a dictionary of end_points.
                end_points = utils.convert_collection_to_dict(
                    end_points_collection)

                if num_classes is not None:
                    end_points['predictions'] = layers_lib.softmax(
                        net, scope='predictions')
                return net, end_points
Exemple #32
0
def disp_net(tgt_image, is_training=True):
    H = tgt_image.get_shape()[1].value
    W = tgt_image.get_shape()[2].value
    with tf.variable_scope('depth_net') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                            normalizer_fn=None,
                            weights_regularizer=slim.l2_regularizer(0.05),
                            activation_fn=tf.nn.relu,
                            outputs_collections=end_points_collection):
            # cnv1  = slim.conv2d(tgt_image, 32,  [7, 7], stride=2, scope='cnv1')
            cnv1  = slim.conv2d(tgt_image, 32,  [7, 7], stride=2, scope='cnv1')
            cnv1b = slim.conv2d(cnv1,  32,  [7, 7], stride=1, scope='cnv1b')
            cnv2  = slim.conv2d(cnv1b, 64,  [5, 5], stride=2, scope='cnv2')
            cnv2b = slim.conv2d(cnv2,  64,  [5, 5], stride=1, scope='cnv2b')
            cnv3  = slim.conv2d(cnv2b, 128, [3, 3], stride=2, scope='cnv3')
            cnv3b = slim.conv2d(cnv3,  128, [3, 3], stride=1, scope='cnv3b')
            cnv4  = slim.conv2d(cnv3b, 256, [3, 3], stride=2, scope='cnv4')
            cnv4b = slim.conv2d(cnv4,  256, [3, 3], stride=1, scope='cnv4b')
            cnv5  = slim.conv2d(cnv4b, 512, [3, 3], stride=2, scope='cnv5')
            cnv5b = slim.conv2d(cnv5,  512, [3, 3], stride=1, scope='cnv5b')
            cnv6  = slim.conv2d(cnv5b, 512, [3, 3], stride=2, scope='cnv6')
            cnv6b = slim.conv2d(cnv6,  512, [3, 3], stride=1, scope='cnv6b')
            cnv7  = slim.conv2d(cnv6b, 512, [3, 3], stride=2, scope='cnv7')
            cnv7b = slim.conv2d(cnv7,  512, [3, 3], stride=1, scope='cnv7b')

            upcnv7 = slim.conv2d_transpose(cnv7b, 512, [3, 3], stride=2, scope='upcnv7')
            # There might be dimension mismatch due to uneven down/up-sampling
            upcnv7 = resize_like(upcnv7, cnv6b)
            i7_in  = tf.concat([upcnv7, cnv6b], axis=3)
            icnv7  = slim.conv2d(i7_in, 512, [3, 3], stride=1, scope='icnv7')

            upcnv6 = slim.conv2d_transpose(icnv7, 512, [3, 3], stride=2, scope='upcnv6')
            upcnv6 = resize_like(upcnv6, cnv5b)
            i6_in  = tf.concat([upcnv6, cnv5b], axis=3)
            icnv6  = slim.conv2d(i6_in, 512, [3, 3], stride=1, scope='icnv6')

            upcnv5 = slim.conv2d_transpose(icnv6, 256, [3, 3], stride=2, scope='upcnv5')
            upcnv5 = resize_like(upcnv5, cnv4b)
            i5_in  = tf.concat([upcnv5, cnv4b], axis=3)
            icnv5  = slim.conv2d(i5_in, 256, [3, 3], stride=1, scope='icnv5')

            upcnv4 = slim.conv2d_transpose(icnv5, 128, [3, 3], stride=2, scope='upcnv4')
            upcnv4 = resize_like(upcnv4, cnv3b)
            i4_in  = tf.concat([upcnv4, cnv3b], axis=3)
            icnv4  = slim.conv2d(i4_in, 128, [3, 3], stride=1, scope='icnv4')
            disp4  = DISP_SCALING * slim.conv2d(icnv4, 1,   [3, 3], stride=1, 
                activation_fn=tf.sigmoid, normalizer_fn=None, scope='disp4') + MIN_DISP
            disp4_up = tf.image.resize_bilinear(disp4, [np.int(H/4), np.int(W/4)])

            upcnv3 = slim.conv2d_transpose(icnv4, 64,  [3, 3], stride=2, scope='upcnv3')
            i3_in  = tf.concat([upcnv3, cnv2b, disp4_up], axis=3)
            icnv3  = slim.conv2d(i3_in, 64,  [3, 3], stride=1, scope='icnv3')
            disp3  = DISP_SCALING * slim.conv2d(icnv3, 1,   [3, 3], stride=1, 
                activation_fn=tf.sigmoid, normalizer_fn=None, scope='disp3') + MIN_DISP
            disp3_up = tf.image.resize_bilinear(disp3, [np.int(H/2), np.int(W/2)])

            upcnv2 = slim.conv2d_transpose(icnv3, 32,  [3, 3], stride=2, scope='upcnv2')
            i2_in  = tf.concat([upcnv2, cnv1b, disp3_up], axis=3)
            icnv2  = slim.conv2d(i2_in, 32,  [3, 3], stride=1, scope='icnv2')
            disp2  = DISP_SCALING * slim.conv2d(icnv2, 1,   [3, 3], stride=1, 
                activation_fn=tf.sigmoid, normalizer_fn=None, scope='disp2') + MIN_DISP
            disp2_up = tf.image.resize_bilinear(disp2, [H, W])

            upcnv1 = slim.conv2d_transpose(icnv2, 16,  [3, 3], stride=2, scope='upcnv1')
            i1_in  = tf.concat([upcnv1, disp2_up], axis=3)
            icnv1  = slim.conv2d(i1_in, 16,  [3, 3], stride=1, scope='icnv1')
            disp1  = DISP_SCALING * slim.conv2d(icnv1, 1,   [3, 3], stride=1, 
                activation_fn=tf.sigmoid, normalizer_fn=None, scope='disp1') + MIN_DISP
            
            end_points = utils.convert_collection_to_dict(end_points_collection)
            return [disp1, disp2, disp3, disp4], end_points
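The resize_like calls above guard against spatial-size mismatches caused by uneven down/up-sampling before the skip connections are concatenated. The helper itself is not shown in this snippet; a plausible sketch under that assumption (not necessarily the repo's actual implementation):

import tensorflow as tf

def resize_like(inputs, ref):
    """Resize `inputs` to the spatial size of `ref` (assumed behaviour)."""
    i_h, i_w = inputs.get_shape().as_list()[1:3]
    r_h, r_w = ref.get_shape().as_list()[1:3]
    if i_h == r_h and i_w == r_w:
        return inputs
    return tf.image.resize_nearest_neighbor(inputs, [r_h, r_w])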
Exemple #33
0
    def get_inference(self,
                      inputs,
                      num_classes,
                      for_training=False,
                      restore_logits=True,
                      scope=None):
        """ Build model
      
    
      Args:
        images: Images returned from inputs() or distorted_inputs().
        num_classes: number of classes
        for_training: If set to `True`, build the inference model for training.
          Kernels that operate differently for inference during training
          e.g. dropout, are appropriately configured.
        restore_logits: whether or not the logits layers should be restored.
          Useful for fine-tuning a model with different num_classes.
        scope: optional prefix string identifying the ImageNet tower.
    
      Returns:
        Logits. 2-D float Tensor.
        Auxiliary Logits. 2-D float Tensor of side-head. Used for training only.
      """

        with variable_scope.variable_scope(scope, 'SegDecNet', [inputs]) as sc:
            end_points_collection = sc.original_name_scope + '_end_points'
            # Collect outputs for conv2d, max_pool2d
            with arg_scope([
                    layers.conv2d, layers.fully_connected,
                    layers_lib.max_pool2d, layers.batch_norm
            ],
                           outputs_collections=end_points_collection):

                # Apply shared parameters to all conv2d and fully_connected layers (batch norm plus the default ReLU activation).
                with arg_scope(
                        [layers.conv2d, layers.fully_connected],
                        weights_initializer=(
                            lambda shape, dtype=tf.float32, partition_info=None:
                            tf.random_normal(shape, mean=0, stddev=0.01, dtype=dtype)),
                        biases_initializer=None,
                        normalizer_fn=layers.batch_norm,
                        normalizer_params={
                            'center': True,
                            'scale': True,
                            # 'is_training': for_training,  # disabled to do feature
                            # normalization (but requires batch size=1)
                            'decay': self.BATCHNORM_MOVING_AVERAGE_DECAY,  # Decay for the moving averages.
                            'epsilon': 0.001,  # epsilon to prevent 0s in variance.
                        }):

                    net = layers_lib.repeat(inputs,
                                            2,
                                            layers.conv2d,
                                            32, [5, 5],
                                            scope='conv1')

                    net = layers_lib.max_pool2d(net, [2, 2], scope='pool1')

                    net = layers_lib.repeat(net,
                                            3,
                                            layers.conv2d,
                                            64, [5, 5],
                                            scope='conv2')

                    net = layers_lib.max_pool2d(net, [2, 2], scope='pool2')

                    net = layers_lib.repeat(net,
                                            4,
                                            layers.conv2d,
                                            64, [5, 5],
                                            scope='conv3')

                    net = layers_lib.max_pool2d(net, [2, 2], scope='pool3')

                    net = layers.conv2d(net,
                                        1024, [15, 15],
                                        padding='SAME',
                                        scope='conv4')

                    net_prob_mat = layers.conv2d(net,
                                                 1, [1, 1],
                                                 scope='conv5',
                                                 activation_fn=None)

                    decision_net = self.decision_net_fn(
                        net, tf.nn.relu(net_prob_mat))

                    # Convert end_points_collection into a end_point dict.
                    endpoints = utils.convert_collection_to_dict(
                        end_points_collection)

        # Add summaries for viewing model statistics on TensorBoard.
        self._activation_summaries(endpoints)

        return net_prob_mat, decision_net, endpoints
Exemple #34
0
def alexnet_v2(inputs,
               num_classes=1000,
               is_training=True,
               dropout_keep_prob=0.5,
               spatial_squeeze=True,
               scope='alexnet_v2'):
  """AlexNet version 2.

  Described in: http://arxiv.org/pdf/1404.5997v2.pdf
  Parameters from:
  github.com/akrizhevsky/cuda-convnet2/blob/master/layers/
  layers-imagenet-1gpu.cfg

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224. To use in fully
        convolutional mode, set spatial_squeeze to false.
        The LRN layers have been removed and change the initializers from
        random_normal_initializer to xavier_initializer.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.

  Returns:
    the last op containing the log predictions and end_points dict.
  """
  with variable_scope.variable_scope(scope, 'alexnet_v2', [inputs]) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d.
    with arg_scope(
        [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d],
        outputs_collections=[end_points_collection]):
      net = layers.conv2d(
          inputs, 64, [11, 11], 4, padding='VALID', scope='conv1')
      net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool1')
      net = layers.conv2d(net, 192, [5, 5], scope='conv2')
      net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool2')
      net = layers.conv2d(net, 384, [3, 3], scope='conv3')
      net = layers.conv2d(net, 384, [3, 3], scope='conv4')
      net = layers.conv2d(net, 256, [3, 3], scope='conv5')
      net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool5')

      # Use conv2d instead of fully_connected layers.
      with arg_scope(
          [layers.conv2d],
          weights_initializer=trunc_normal(0.005),
          biases_initializer=init_ops.constant_initializer(0.1)):
        net = layers.conv2d(net, 4096, [5, 5], padding='VALID', scope='fc6')
        net = layers_lib.dropout(
            net, dropout_keep_prob, is_training=is_training, scope='dropout6')
        net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
        net = layers_lib.dropout(
            net, dropout_keep_prob, is_training=is_training, scope='dropout7')
        net = layers.conv2d(
            net,
            num_classes, [1, 1],
            activation_fn=None,
            normalizer_fn=None,
            biases_initializer=init_ops.zeros_initializer(),
            scope='fc8')

      # Convert end_points_collection into a end_point dict.
      end_points = utils.convert_collection_to_dict(end_points_collection)
      if spatial_squeeze:
        net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
      return net, end_points
    def body(self, features):
        hp = self.hparams
        # pylint: disable=eval-used
        if hp.image_input_type == "image":
            image_feat = vqa_layers.image_embedding(
                features["inputs"],
                model_fn=eval(hp.image_model_fn),
                trainable=hp.train_resnet,
                is_training=hp.mode == tf.estimator.ModeKeys.TRAIN)
        else:
            image_feat = features["inputs"]

        image_feat = common_layers.flatten4d3d(image_feat)
        image_feat = common_layers.dense(image_feat, hp.hidden_size)
        utils.collect_named_outputs("norms", "image_feat_after_proj",
                                    tf.norm(image_feat, axis=-1))

        question = common_layers.flatten4d3d(features["question"])
        utils.collect_named_outputs("norms", "question_embedding",
                                    tf.norm(question, axis=-1))
        (encoder_input, encoder_self_attention_bias,
         encoder_decoder_attention_bias) = prepare_image_question_encoder(
             image_feat, question, hp)

        encoder_input = tf.nn.dropout(encoder_input,
                                      keep_prob=1. -
                                      hp.layer_prepostprocess_dropout)

        encoder_output, _ = recurrent_transformer_decoder(
            encoder_input,
            None,
            encoder_self_attention_bias,
            None,
            hp,
            name="encoder")
        utils.collect_named_outputs("norms", "encoder_output",
                                    tf.norm(encoder_output, axis=-1))

        # scale query by sqrt(hidden_size)
        query = tf.get_variable("query",
                                [hp.hidden_size]) * hp.hidden_size**0.5
        query = tf.expand_dims(tf.expand_dims(query, axis=0), axis=0)
        batch_size = common_layers.shape_list(encoder_input)[0]
        query = tf.tile(query, [batch_size, 1, 1])
        query = tf.nn.dropout(query,
                              keep_prob=1. - hp.layer_prepostprocess_dropout)

        decoder_output, _ = recurrent_transformer_decoder(
            query,
            encoder_output,
            None,
            encoder_decoder_attention_bias,
            hp,
            name="decoder")
        utils.collect_named_outputs("norms", "decoder_output",
                                    tf.norm(decoder_output, axis=-1))

        norm_tensors = utils.convert_collection_to_dict("norms")
        vqa_layers.summarize_tensors(norm_tensors, tag="norms/")

        # Expand dimension 1 and 2
        return tf.expand_dims(decoder_output, axis=1)
Exemple #36
0
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              is_training=None,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              reuse=None,
              scope=None):
  """Generator for v2 (preactivation) ResNet models.

  This function generates a family of ResNet v2 models. See the resnet_v2_*()
  methods for specific model instantiations, obtained by selecting different
  block instantiations that produce ResNets of various depths.

  Training for image classification on Imagenet is usually done with [224, 224]
  inputs, resulting in [7, 7] feature maps at the output of the last ResNet
  block for the ResNets defined in [1] that have nominal stride equal to 32.
  However, for dense prediction tasks we advise that one uses inputs with
  spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
  this case the feature maps at the ResNet output will have spatial shape
  [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
  and corners exactly aligned with the input image corners, which greatly
  facilitates alignment of the features to the image. Using as input [225, 225]
  images results in [8, 8] feature maps at the output of the last ResNet block.

  For dense prediction tasks, the ResNet needs to run in fully-convolutional
  (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
  have nominal stride equal to 32 and a good choice in FCN mode is to use
  output_stride=16 in order to increase the density of the computed features at
  small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    blocks: A list of length equal to the number of ResNet blocks. Each element
      is a resnet_utils.Block object describing the units in the block.
    num_classes: Number of predicted classes for classification tasks. If None
      we return the features before the logit layer.
    is_training: whether is training or not. If None, the value inherited from
      the resnet_arg_scope is used. Specifying value None is deprecated.
    global_pool: If True, we perform global average pooling before computing the
      logits. Set to True for image classification, False for dense prediction.
    output_stride: If None, then the output will be computed at the nominal
      network stride. If output_stride is not None, it specifies the requested
      ratio of input to output spatial resolution.
    include_root_block: If True, include the initial convolution followed by
      max-pooling, if False excludes it. If excluded, `inputs` should be the
      results of an activation-less convolution.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.


  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
      If global_pool is False, then height_out and width_out are reduced by a
      factor of output_stride compared to the respective height_in and width_in,
      else both height_out and width_out equal one. If num_classes is None, then
      net is the output of the last ResNet block, potentially after global
      average pooling. If num_classes is not None, net contains the pre-softmax
      activations.
    end_points: A dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
  with variable_scope.variable_scope(
      scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    with arg_scope(
        [layers_lib.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
        outputs_collections=end_points_collection):
      if is_training is not None:
        bn_scope = arg_scope([layers.batch_norm], is_training=is_training)
      else:
        bn_scope = arg_scope([])
      with bn_scope:
        net = inputs
        if include_root_block:
          if output_stride is not None:
            if output_stride % 4 != 0:
              raise ValueError('The output_stride needs to be a multiple of 4.')
            output_stride /= 4
          # We do not include batch normalization or activation functions in
          # conv1 because the first ResNet unit will perform these. Cf.
          # Appendix of [2].
          with arg_scope(
              [layers_lib.conv2d], activation_fn=None, normalizer_fn=None):
            net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
          net = layers.max_pool2d(net, [3, 3], stride=2, scope='pool1')
        net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
        # This is needed because the pre-activation variant does not have batch
        # normalization or activation functions in the residual unit output. See
        # Appendix of [2].
        net = layers.batch_norm(
            net, activation_fn=nn_ops.relu, scope='postnorm')
        if global_pool:
          # Global average pooling.
          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keep_dims=True)
        if num_classes is not None:
          net = layers_lib.conv2d(
              net,
              num_classes, [1, 1],
              activation_fn=None,
              normalizer_fn=None,
              scope='logits')
        # Convert end_points_collection into a dictionary of end_points.
        end_points = utils.convert_collection_to_dict(end_points_collection)
        if num_classes is not None:
          end_points['predictions'] = layers.softmax(net, scope='predictions')
        return net, end_points
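As a quick check of the shape rule quoted in the docstring above ("[(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]" for inputs whose spatial dimensions are a multiple of the stride plus one), a small arithmetic sketch in plain Python with a hypothetical helper name:

def aligned_feature_size(input_dim, output_stride):
    """Spatial size of the ResNet output for an input of size output_stride * k + 1."""
    assert (input_dim - 1) % output_stride == 0, 'use a multiple of output_stride plus 1'
    return (input_dim - 1) // output_stride + 1

# 321x321 inputs: [11, 11] feature maps at the nominal stride of 32, and
# [21, 21] feature maps when running with output_stride=16 for dense prediction.
print(aligned_feature_size(321, 32))  # 11
print(aligned_feature_size(321, 16))  # 21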
def alexnet_v2(inputs,
               num_classes=1000,
               is_training=True,
               dropout_keep_prob=0.5,
               spatial_squeeze=True,
               scope='alexnet_v2'):
    """AlexNet version 2.

      Described in: http://arxiv.org/pdf/1404.5997v2.pdf
      Parameters from:
      github.com/akrizhevsky/cuda-convnet2/blob/master/layers/
      layers-imagenet-1gpu.cfg

    Note: All the fully_connected layers have been transformed to conv2d layers.
            To use in classification mode, resize input to 224x224. To use in fully
            convolutional mode, set spatial_squeeze to false.
            The LRN layers have been removed and change the initializers from
            random_normal_initializer to xavier_initializer.

    Args:
        inputs: a tensor of size [batch_size, height, width, channels].
        num_classes: number of predicted classes.
        is_training: whether or not the model is being trained.
        dropout_keep_prob: the probability that activations are kept in the dropout
          layers during training.
        spatial_squeeze: whether or not should squeeze the spatial dimensions of the
          outputs. Useful to remove unnecessary dimensions for classification.
        scope: Optional scope for the variables.

    Returns:
        the last op containing the log predictions and end_points dict.
    """
    with variable_scope.variable_scope(scope, 'alexnet_v2', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with arg_scope(
            [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d],
                outputs_collections=[end_points_collection]):
            net = layers.conv2d(inputs,
                                64, [11, 11],
                                4,
                                padding='VALID',
                                scope='conv1')
            net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool1')
            net = layers.conv2d(net, 192, [5, 5], scope='conv2')
            net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool2')
            net = layers.conv2d(net, 384, [3, 3], scope='conv3')
            net = layers.conv2d(net, 384, [3, 3], scope='conv4')
            net = layers.conv2d(net, 256, [3, 3], scope='conv5')
            net = layers_lib.max_pool2d(net, [3, 3], 2, scope='pool5')

            # Use conv2d instead of fully_connected layers.
            with arg_scope(
                [layers.conv2d],
                    weights_initializer=trunc_normal(0.005),
                    biases_initializer=init_ops.constant_initializer(0.1)):
                net = layers.conv2d(net,
                                    4096, [2, 1],
                                    padding='VALID',
                                    scope='fc6')
                net = layers_lib.dropout(net,
                                         dropout_keep_prob,
                                         is_training=is_training,
                                         scope='dropout6')
                net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
                net = layers_lib.dropout(net,
                                         dropout_keep_prob,
                                         is_training=is_training,
                                         scope='dropout7')
                net = layers.conv2d(
                    net,
                    num_classes, [1, 1],
                    activation_fn=None,
                    normalizer_fn=None,
                    biases_initializer=init_ops.zeros_initializer(),
                    scope='logits')

            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            if spatial_squeeze:
                net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed')
                end_points[sc.name + '/fc8'] = net
            return net, end_points
Exemple #38
0
def decouple_net_v0_dilation(tgt_image,
                             src0_image,
                             src1_image,
                             dropout=False,
                             is_training=True,
                             se_attention=False,
                             batch_norm=False,
                             cnv6_num_outputs=128):
    """
    Input:
      flow_maps: centrelized.
    Return:
      pose_final = [rz,ry,rx,tx,ty,tz]
    """
    num_source = 2

    inputs = tf.concat([tgt_image, src0_image, src1_image], axis=3)
    print(">>> [PoseNN] inputs : ", inputs)
    print(">>> [PoseNN] use batch_norm : ",
          slim.batch_norm if batch_norm else None)
    print(">>> [PoseNN] cnv6_num_outputs = ", cnv6_num_outputs)

    with tf.variable_scope('pose_exp_net') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope(
            [slim.conv2d, slim.conv2d_transpose],
                normalizer_fn=slim.batch_norm if batch_norm else None,
                weights_regularizer=slim.l2_regularizer(1e-4),
                activation_fn=tf.nn.relu,
                outputs_collections=end_points_collection):
            # cnv1 to cnv5b are shared between pose and explainability prediction
            cnv1 = slim.conv2d(inputs, 16, [7, 7], stride=2, scope='cnv1')
            cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
            cnv3 = slim.conv2d(cnv2, 64, [3, 3], rate=2, scope='cnv3')
            cnv4 = slim.conv2d(cnv3, 128, [3, 3], rate=4, scope='cnv4')
            cnv5 = slim.conv2d(cnv4, 256, [3, 3], rate=8, scope='cnv5')
            if dropout:
                cnv5 = slim.dropout(cnv5, 0.7, is_training=is_training)
            # Pose specific layers
            cnv6s = {}
            poses_avg = {}
            with tf.variable_scope('pose'):
                for name in ['rotation', 'translation']:
                    with tf.variable_scope(name):
                        # cnv6 = tf.layers.conv2d(cnv5, 128, [3, 3], dilation_rate=(2, 2), padding='same', activation=tf.nn.relu, name='cnv6')
                        if se_attention is True:  # mode1
                            print(
                                ">>> [PoseNN][%s] use se_attention (insert se_block between cnv5 & cnv6)"
                                % name)
                            cnv5 = se_block(cnv5, 'cnv5_se_attention', ratio=8)
                            cnv6 = slim.conv2d(cnv5,
                                               cnv6_num_outputs, [3, 3],
                                               rate=2,
                                               scope='cnv6')
                        elif se_attention == 'se_skipadd':  # mode3
                            print(
                                ">>> [PoseNN][%s] use cnv5 + dilated_cnv6_se_attention"
                                % name)
                            cnv6 = slim.conv2d(cnv5,
                                               cnv6_num_outputs, [3, 3],
                                               rate=2,
                                               scope='cnv6')
                            se_cnv6 = se_block(cnv6,
                                               'cnv6_se_attention',
                                               ratio=8)
                            cnv6 = tf.nn.relu(
                                cnv5 + se_cnv6,
                                name="cnv6_se_attention/add_cnv5/relu")
                        elif se_attention == 'se_replace':  # mode2
                            print(
                                ">>> [PoseNN][%s] use se_attention replace with cnv6"
                                % name)
                            cnv6 = se_block(cnv5, 'cnv6_se_attention', ratio=8)
                        else:
                            cnv6 = slim.conv2d(cnv5,
                                               cnv6_num_outputs, [3, 3],
                                               rate=2,
                                               scope='cnv6')
                        cnv7 = slim.conv2d(cnv6,
                                           256, [3, 3],
                                           stride=2,
                                           scope='cnv7')
                        pred = slim.conv2d(cnv7,
                                           3 * num_source, [1, 1],
                                           scope='pred',
                                           stride=1,
                                           normalizer_fn=None,
                                           activation_fn=None)
                        avg = tf.reduce_mean(pred, [1, 2])
                        poses_avg[name] = avg
                        cnv6s[name] = cnv6
                # Empirically we found that scaling by a small constant facilitates training.
                rot_final = tf.reshape(poses_avg['rotation'],
                                       [-1, num_source, 3])
                trans_final = tf.reshape(poses_avg['translation'],
                                         [-1, num_source, 3])
                pose_final = 0.01 * tf.concat([rot_final, trans_final],
                                              axis=-1)  # -V4 : 2019/08/05
            # Exp mask specific layers
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            #return pose_final, end_points
            return pose_final, (cnv6s['rotation'], cnv6s['translation'])
Exemple #39
0
def vgg_a(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.5,
          spatial_squeeze=True,
          scope='vgg_a',
          fc_conv_padding='VALID',
          global_pool=False):
    """Oxford Net VGG 11-Layers version A Example.
  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 224x224.
  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes. If 0 or None, the logits layer is
      omitted and the input features to the logits layer are returned instead.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not should squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.
    fc_conv_padding: the type of padding to use for the fully connected layer
      that is implemented as a convolutional layer. Use 'SAME' padding if you
      are applying the network in a fully convolutional manner and want to
      get a prediction map downsampled by a factor of 32 as an output.
      Otherwise, the output prediction map will be (input / 32) - 6 in case of
      'VALID' padding.
    global_pool: Optional boolean flag. If True, the input to the classification
      layer is avgpooled to size 1x1, for any input size. (This is not part
      of the original VGG architecture.)
  Returns:
    net: the output of the logits layer (if num_classes is a non-zero integer),
      or the input to the logits layer (if num_classes is 0 or None).
    end_points: a dict of tensors with intermediate activations.
  """
    with tf.variable_scope(scope, 'vgg_a', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                            outputs_collections=end_points_collection):
            net = slim.repeat(inputs,
                              1,
                              slim.conv2d,
                              64, [3, 3],
                              scope='conv1')
            net = slim.max_pool2d(net, [2, 2], scope='pool1')
            net = slim.repeat(net, 1, slim.conv2d, 128, [3, 3], scope='conv2')
            net = slim.max_pool2d(net, [2, 2], scope='pool2')
            net = slim.repeat(net, 2, slim.conv2d, 256, [3, 3], scope='conv3')
            net = slim.max_pool2d(net, [2, 2], scope='pool3')
            net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv4')
            net = slim.max_pool2d(net, [2, 2], scope='pool4')
            net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv5')
            net = slim.max_pool2d(net, [2, 2], scope='pool5')

            # Use conv2d instead of fully_connected layers.
            net = slim.conv2d(net,
                              4096, [7, 7],
                              padding=fc_conv_padding,
                              scope='fc6')
            net = slim.dropout(net,
                               dropout_keep_prob,
                               is_training=is_training,
                               scope='dropout6')
            net = slim.conv2d(net, 4096, [1, 1], scope='fc7')
            # Convert end_points_collection into a end_point dict.
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            if global_pool:
                net = tf.reduce_mean(net, [1, 2],
                                     keep_dims=True,
                                     name='global_pool')
                end_points['global_pool'] = net
            if num_classes:
                net = slim.dropout(net,
                                   dropout_keep_prob,
                                   is_training=is_training,
                                   scope='dropout7')
                net = slim.conv2d(net,
                                  num_classes, [1, 1],
                                  activation_fn=None,
                                  normalizer_fn=None,
                                  scope='fc8')
                if spatial_squeeze:
                    net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
                end_points[sc.name + '/fc8'] = net
            return net, end_points
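The fc_conv_padding note in the docstring above can be checked with one line of arithmetic: fc6 applies a 7x7 kernel to the 1/32-resolution feature map, so with 'VALID' padding the prediction map side is input/32 - 6. A tiny sketch with a hypothetical helper name:

def vgg_valid_prediction_map(input_dim):
    """Side length of the prediction map when fc_conv_padding='VALID'."""
    return input_dim // 32 - 6

print(vgg_valid_prediction_map(224))  # 1  -> a single spatial position, squeezable logits
print(vgg_valid_prediction_map(512))  # 10 -> a 10x10 prediction map in fully convolutional use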
Exemple #40
0
def pose_exp_net(tgt_image, src_image_stack, do_exp=True, is_training=True):
    inputs = tf.concat([tgt_image, src_image_stack], axis=3)
    H = inputs.get_shape()[1].value
    W = inputs.get_shape()[2].value
    num_source = int(src_image_stack.get_shape()[3].value // 3)
    with tf.variable_scope('pose_exp_net') as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        with slim.arg_scope([slim.conv2d, slim.conv2d_transpose],
                            normalizer_fn=None,
                            weights_regularizer=slim.l2_regularizer(0.05),
                            activation_fn=tf.nn.leaky_relu,
                            outputs_collections=end_points_collection):
            # cnv1 to cnv5b are shared between pose and explainability prediction
            cnv1 = slim.conv2d(inputs, 16, [7, 7], stride=2, scope='cnv1')
            cnv2 = slim.conv2d(cnv1, 32, [5, 5], stride=2, scope='cnv2')
            cnv3 = slim.conv2d(cnv2, 64, [3, 3], stride=2, scope='cnv3')
            cnv4 = slim.conv2d(cnv3, 128, [3, 3], stride=2, scope='cnv4')
            cnv5 = slim.conv2d(cnv4, 256, [3, 3], stride=2, scope='cnv5')

            # Pose specific layers
            with tf.variable_scope('pose'):
                cnv6 = slim.conv2d(cnv5, 256, [3, 3], stride=2, scope='cnv6')
                cnv7 = slim.conv2d(cnv6, 256, [3, 3], stride=2, scope='cnv7')
                pose_pred = slim.conv2d(cnv7,
                                        6 * num_source, [1, 1],
                                        scope='pred',
                                        stride=1,
                                        normalizer_fn=None,
                                        activation_fn=None)
                pose_avg = tf.reduce_mean(pose_pred, [1, 2])
                # Empirically we found that scaling by a small constant
                # facilitates training.
                pose_final = 0.01 * tf.reshape(pose_avg, [-1, num_source, 6])
            # Exp mask specific layers
            if do_exp:
                with tf.variable_scope('exp'):
                    upcnv5 = slim.conv2d_transpose(cnv5,
                                                   256, [3, 3],
                                                   stride=2,
                                                   scope='upcnv5')

                    upcnv4 = slim.conv2d_transpose(upcnv5,
                                                   128, [3, 3],
                                                   stride=2,
                                                   scope='upcnv4')
                    mask4 = slim.conv2d(upcnv4,
                                        num_source * 2, [3, 3],
                                        stride=1,
                                        scope='mask4',
                                        normalizer_fn=None,
                                        activation_fn=None)

                    upcnv3 = slim.conv2d_transpose(upcnv4,
                                                   64, [3, 3],
                                                   stride=2,
                                                   scope='upcnv3')
                    mask3 = slim.conv2d(upcnv3,
                                        num_source * 2, [3, 3],
                                        stride=1,
                                        scope='mask3',
                                        normalizer_fn=None,
                                        activation_fn=None)

                    upcnv2 = slim.conv2d_transpose(upcnv3,
                                                   32, [5, 5],
                                                   stride=2,
                                                   scope='upcnv2')
                    mask2 = slim.conv2d(upcnv2,
                                        num_source * 2, [5, 5],
                                        stride=1,
                                        scope='mask2',
                                        normalizer_fn=None,
                                        activation_fn=None)

                    upcnv1 = slim.conv2d_transpose(upcnv2,
                                                   16, [7, 7],
                                                   stride=2,
                                                   scope='upcnv1')
                    mask1 = slim.conv2d(upcnv1,
                                        num_source * 2, [7, 7],
                                        stride=1,
                                        scope='mask1',
                                        normalizer_fn=None,
                                        activation_fn=None)
            else:
                mask1 = None
                mask2 = None
                mask3 = None
                mask4 = None
            end_points = utils.convert_collection_to_dict(
                end_points_collection)
            return pose_final, [mask1, mask2, mask3, mask4], end_points
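A hedged usage sketch for the pose_exp_net snippet above (not part of the original example): the snippet relies on module-level names slim and utils, which are supplied here in one plausible TF 1.x way, and the batch/frame sizes below are arbitrary illustrations.

# Usage sketch only; assumes TF 1.x and that pose_exp_net is defined as above.
import tensorflow as tf
from tensorflow.contrib.layers.python.layers import utils  # provides convert_collection_to_dict
slim = tf.contrib.slim

# Arbitrary example sizes: batch of 4, 128x416 frames, 2 source views stacked on channels.
tgt_image = tf.placeholder(tf.float32, [4, 128, 416, 3])
src_image_stack = tf.placeholder(tf.float32, [4, 128, 416, 3 * 2])

pose, masks, end_points = pose_exp_net(tgt_image, src_image_stack,
                                       do_exp=True, is_training=True)
print(pose.get_shape())   # (4, 2, 6): one 6-DoF pose per source view
print(len(masks))         # 4 explainability masks, from full resolution down to 1/8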
Example #41
def overfeat(inputs,
             num_classes=1000,
             is_training=True,
             dropout_keep_prob=0.5,
             spatial_squeeze=True,
             scope='overfeat'):
  """Contains the model definition for the OverFeat network.

  The definition for the network was obtained from:
    OverFeat: Integrated Recognition, Localization and Detection using
    Convolutional Networks
    Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus and
    Yann LeCun, 2014
    http://arxiv.org/abs/1312.6229

  Note: All the fully_connected layers have been transformed to conv2d layers.
        To use in classification mode, resize input to 231x231. To use in fully
        convolutional mode, set spatial_squeeze to false.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether or not the model is being trained.
    dropout_keep_prob: the probability that activations are kept in the dropout
      layers during training.
    spatial_squeeze: whether or not to squeeze the spatial dimensions of the
      outputs. Useful to remove unnecessary dimensions for classification.
    scope: Optional scope for the variables.

  Returns:
    the last op containing the log predictions and end_points dict.

  """
  with variable_scope.variable_scope(scope, 'overfeat', [inputs]) as sc:
    end_points_collection = sc.name + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d
    with arg_scope(
        [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d],
        outputs_collections=end_points_collection):
      net = layers.conv2d(
          inputs, 64, [11, 11], 4, padding='VALID', scope='conv1')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool1')
      net = layers.conv2d(net, 256, [5, 5], padding='VALID', scope='conv2')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool2')
      net = layers.conv2d(net, 512, [3, 3], scope='conv3')
      net = layers.conv2d(net, 1024, [3, 3], scope='conv4')
      net = layers.conv2d(net, 1024, [3, 3], scope='conv5')
      net = layers_lib.max_pool2d(net, [2, 2], scope='pool5')
      with arg_scope(
          [layers.conv2d],
          weights_initializer=trunc_normal(0.005),
          biases_initializer=init_ops.constant_initializer(0.1)):
        # Use conv2d instead of fully_connected layers.
        net = layers.conv2d(net, 3072, [6, 6], padding='VALID', scope='fc6')
        net = layers_lib.dropout(
            net, dropout_keep_prob, is_training=is_training, scope='dropout6')
        net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
        net = layers_lib.dropout(
            net, dropout_keep_prob, is_training=is_training, scope='dropout7')
        net = layers.conv2d(
            net,
            num_classes, [1, 1],
            activation_fn=None,
            normalizer_fn=None,
            biases_initializer=init_ops.zeros_initializer(),
            scope='fc8')
      # Convert end_points_collection into an end_points dict.
      end_points = utils.convert_collection_to_dict(end_points_collection)
      if spatial_squeeze:
        net = array_ops.squeeze(net, [1, 2], name='fc8/squeezed')
        end_points[sc.name + '/fc8'] = net
      return net, end_points
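As a hedged illustration (not from the original listing), the overfeat snippet above could be exercised as follows, assuming its slim-style imports (variable_scope, layers, layers_lib, arg_scope, init_ops, array_ops, utils, trunc_normal) are already in scope; the 231x231 input size follows the docstring note, and the batch size is an arbitrary choice.

# Usage sketch only; assumes TF 1.x and that overfeat is defined as above.
import tensorflow as tf

images = tf.random_uniform([8, 231, 231, 3])   # batch of 8 random 231x231 RGB images
logits, end_points = overfeat(images, num_classes=1000, is_training=False)
print(logits.get_shape())                      # (8, 1000) after the spatial squeeze
probs = tf.nn.softmax(logits)                  # convert logits to class probabilities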