Example #1
    def extract_features(self, preprocessed_inputs,
                         preprocessed_second_inputs):
        """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
        preprocessed_inputs.get_shape().assert_has_rank(4)
        shape_assert = tf.Assert(
            tf.logical_and(
                tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
                tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
            ['image size must be at least 33 in both height and width.'])

        feature_map_layout = {
            'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
            'layer_depth': [-1, -1, -1, 512, 256, 128],
        }

        #print("preprocessed_second_inputs", preprocessed_second_inputs.get_shape())

        with tf.control_dependencies([shape_assert]):
            with slim.arg_scope(self._conv_hyperparams):
                with tf.variable_scope('InceptionV3',
                                       reuse=self._reuse_weights) as scope:

                    audio_features = self.extract_audio_features(
                        preprocessed_second_inputs)

                    _, image_features = inception_v3.inception_v3_base(
                        ops.pad_to_multiple(preprocessed_inputs,
                                            self._pad_to_multiple),
                        final_endpoint='Mixed_7c',
                        min_depth=self._min_depth,
                        depth_multiplier=self._depth_multiplier,
                        scope=scope)

                    feature_maps = feature_map_generators.multi_resolution_feature_maps(
                        feature_map_layout=feature_map_layout,
                        depth_multiplier=self._depth_multiplier,
                        min_depth=self._min_depth,
                        insert_1x1_conv=True,
                        image_features=image_features)

        return feature_maps.values(), audio_features['fc4']
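The feature_map_layout dict is the contract with feature_map_generators.multi_resolution_feature_maps: named entries in 'from_layer' reuse existing endpoints (a 'layer_depth' of -1 keeps their native depth), while empty strings ask the generator to append new stride-2 convolution layers of the given depth. A minimal sketch, assuming the TensorFlow Object Detection API is importable; the endpoint shapes are illustrative values for a 299x299 input:

import tensorflow as tf
from object_detection.models import feature_map_generators

# Hypothetical stand-ins for the Mixed_* endpoints produced by
# inception_v3_base (shapes are illustrative only).
image_features = {
    'Mixed_5d': tf.placeholder(tf.float32, [1, 35, 35, 288]),
    'Mixed_6e': tf.placeholder(tf.float32, [1, 17, 17, 768]),
    'Mixed_7c': tf.placeholder(tf.float32, [1, 8, 8, 2048]),
}

feature_map_layout = {
    'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
    'layer_depth': [-1, -1, -1, 512, 256, 128],
}

feature_maps = feature_map_generators.multi_resolution_feature_maps(
    feature_map_layout=feature_map_layout,
    depth_multiplier=1.0,
    min_depth=16,
    insert_1x1_conv=True,
    image_features=image_features)
# Six maps: the three Inception endpoints plus three extra layers of
# depth 512, 256 and 128 at progressively smaller resolutions.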
Example #2
def inception_v3_ssd(img):
    with slim.arg_scope(inception_v3.inception_v3_arg_scope()):
        _, end_points = inception_v3.inception_v3_base(img)
        vbs = slim.get_variables_to_restore()

    c1 = end_points['Mixed_5d']
    c2 = end_points['Mixed_6e']
    c3 = end_points['Mixed_7c']

    return c1, c2, c3, vbs
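A hedged usage sketch for this helper; the placeholder shape and the checkpoint path are assumptions:

import tensorflow as tf

img = tf.placeholder(tf.float32, [None, 299, 299, 3])
c1, c2, c3, vbs = inception_v3_ssd(img)

# `vbs` holds the InceptionV3 variables, so a Saver built from it can
# restore an ImageNet checkpoint while new SSD heads train from scratch.
saver = tf.train.Saver(vbs)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver.restore(sess, '/path/to/inception_v3.ckpt')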
Example #3
    def extract_features(self, preprocessed_inputs, audio_inputs=None):
        """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
        preprocessed_inputs = shape_utils.check_min_image_dim(
            33, preprocessed_inputs)

        feature_map_layout = {
            'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
            'layer_depth': [-1, -1, -1, 512, 256, 128],
            'use_explicit_padding': self._use_explicit_padding,
            'use_depthwise': self._use_depthwise,
        }

        with slim.arg_scope(self._conv_hyperparams_fn()):
            with tf.variable_scope('InceptionV3',
                                   reuse=self._reuse_weights) as scope:

                if audio_inputs is not None:
                    audio_features = self.extract_audio_features(audio_inputs)
                else:
                    audio_features = None

                _, image_features = inception_v3.inception_v3_base(
                    ops.pad_to_multiple(preprocessed_inputs,
                                        self._pad_to_multiple),
                    final_endpoint='Mixed_7c',
                    min_depth=self._min_depth,
                    depth_multiplier=self._depth_multiplier,
                    scope=scope)

                feature_maps = feature_map_generators.multi_resolution_feature_maps(
                    feature_map_layout=feature_map_layout,
                    depth_multiplier=self._depth_multiplier,
                    min_depth=self._min_depth,
                    insert_1x1_conv=True,
                    image_features=image_features)

                # A list of multi-resolution feature maps
                # (Mixed_5d, Mixed_6e, ...).

        return feature_maps.values(), audio_features
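extract_audio_features is called throughout these examples but never shown. A hypothetical sketch of such a method, assuming slim is tf.contrib.slim as elsewhere here; the input shape, layer sizes, and the 'flat'/'fc4'/'fc5' endpoint names are assumptions inferred from how its result is used above:

    def extract_audio_features(self, audio_inputs):
        """Hypothetical audio branch returning a dict of named endpoints.

        Assumes audio_inputs is a [batch, time, freq, 1] float tensor,
        e.g. a batch of log-mel spectrograms.
        """
        endpoints = {}
        net = slim.conv2d(audio_inputs, 64, kernel_size=3, scope='audio_conv1')
        net = slim.max_pool2d(net, kernel_size=2, scope='audio_pool1')
        net = slim.flatten(net, scope='audio_flatten')
        endpoints['flat'] = net
        net = slim.fully_connected(net, 512, scope='audio_fc4')
        endpoints['fc4'] = net
        net = slim.fully_connected(net, 256, scope='audio_fc5')
        endpoints['fc5'] = net
        return endpoints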
Example #4
  def extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs.get_shape().assert_has_rank(4)
    shape_assert = tf.Assert(
        tf.logical_and(tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
                       tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
        ['image size must be at least 33 in both height and width.'])

    feature_map_layout = {
        'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
        'layer_depth': [-1, -1, -1, 512, 256, 128],
    }

    with tf.control_dependencies([shape_assert]):
      with slim.arg_scope(self._conv_hyperparams):
        with tf.variable_scope('InceptionV3',
                               reuse=self._reuse_weights) as scope:
          _, image_features = inception_v3.inception_v3_base(
              ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
              final_endpoint='Mixed_7c',
              min_depth=self._min_depth,
              depth_multiplier=self._depth_multiplier,
              scope=scope)
          feature_maps = feature_map_generators.multi_resolution_feature_maps(
              feature_map_layout=feature_map_layout,
              depth_multiplier=self._depth_multiplier,
              min_depth=self._min_depth,
              insert_1x1_conv=True,
              image_features=image_features)

    return feature_maps.values()
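The snippets above show two ways to enforce the 33-pixel minimum: an explicit tf.Assert under tf.control_dependencies, and shape_utils.check_min_image_dim. A simplified sketch of what such a helper can do, not the library's exact implementation: validate the static shape when it is known at graph-construction time, and fall back to a runtime assertion otherwise:

import tensorflow as tf

def check_min_image_dim_sketch(min_dim, image_tensor):
    """Simplified sketch of a minimum-image-size check."""
    shape = image_tensor.get_shape().as_list()
    height, width = shape[1], shape[2]
    if height is not None and width is not None:
        # Static shapes: fail fast while building the graph.
        if height < min_dim or width < min_dim:
            raise ValueError(
                'image size must be at least %d in both dimensions.' % min_dim)
        return image_tensor
    # Dynamic shapes: defer the check to session run time.
    assert_op = tf.Assert(
        tf.logical_and(
            tf.greater_equal(tf.shape(image_tensor)[1], min_dim),
            tf.greater_equal(tf.shape(image_tensor)[2], min_dim)),
        ['image size must be at least %d in both dimensions.' % min_dim])
    with tf.control_dependencies([assert_op]):
        return tf.identity(image_tensor)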
Example #5
    def _extract_proposal_features(self, preprocessed_inputs, scope):
        """Extracts first stage RPN features.

        Args:
          preprocessed_inputs: A [batch, height, width, channels] float32 tensor
            representing a batch of images.
          scope: A scope name.

        Returns:
          rpn_feature_map: A tensor with shape [batch, height, width, depth]
        Raises:
          InvalidArgumentError: If the spatial size of `preprocessed_inputs`
            (height or width) is less than 33.
          ValueError: If the created network is missing the required activation.
        """

        preprocessed_inputs.get_shape().assert_has_rank(4)
        shape_assert = tf.Assert(
            tf.logical_and(
                tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
                tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
            ['image size must be at least 33 in both height and width.'])

        with tf.control_dependencies([shape_assert]):
            with tf.variable_scope('InceptionV3',
                                   reuse=self._reuse_weights) as scope:
                with _batch_norm_arg_scope(
                    [slim.conv2d, slim.separable_conv2d],
                        batch_norm_scale=True,
                        train_batch_norm=self._train_batch_norm):
                    _, activations = inception_v3.inception_v3_base(
                        preprocessed_inputs,
                        final_endpoint='Mixed_6e',
                        min_depth=self._min_depth,
                        depth_multiplier=self._depth_multiplier,
                        scope=scope)

        return activations['Mixed_6e']
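_batch_norm_arg_scope is used above but not defined in these snippets. A sketch of such a helper, modeled on the version in the TF Object Detection API's Faster R-CNN feature extractors; treat the exact defaults as assumptions:

from tensorflow.contrib import slim

def _batch_norm_arg_scope(list_ops,
                          use_batch_norm=True,
                          batch_norm_decay=0.9997,
                          batch_norm_epsilon=0.001,
                          batch_norm_scale=False,
                          train_batch_norm=False):
    """Slim arg scope attaching batch norm to the given ops."""
    if use_batch_norm:
        batch_norm_params = {
            'is_training': train_batch_norm,
            'scale': batch_norm_scale,
            'decay': batch_norm_decay,
            'epsilon': batch_norm_epsilon
        }
        normalizer_fn = slim.batch_norm
    else:
        batch_norm_params = None
        normalizer_fn = None
    return slim.arg_scope(list_ops,
                          normalizer_fn=normalizer_fn,
                          normalizer_params=batch_norm_params)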
Example #6
  def _extract_features(self, preprocessed_inputs):
    """Extract features from preprocessed inputs.

    Args:
      preprocessed_inputs: a [batch, height, width, channels] float tensor
        representing a batch of images.

    Returns:
      feature_maps: a list of tensors where the ith tensor has shape
        [batch, height_i, width_i, depth_i]
    """
    preprocessed_inputs = shape_utils.check_min_image_dim(
        33, preprocessed_inputs)

    feature_map_layout = {
        'from_layer': ['Mixed_5d', 'Mixed_6e', 'Mixed_7c', '', '', ''],
        'layer_depth': [-1, -1, -1, 512, 256, 128],
        'use_explicit_padding': self._use_explicit_padding,
        'use_depthwise': self._use_depthwise,
    }

    with slim.arg_scope(self._conv_hyperparams):
      with tf.variable_scope('InceptionV3', reuse=self._reuse_weights) as scope:
        _, image_features = inception_v3.inception_v3_base(
            ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
            final_endpoint='Mixed_7c',
            min_depth=self._min_depth,
            depth_multiplier=self._depth_multiplier,
            scope=scope)
        feature_maps = feature_map_generators.multi_resolution_feature_maps(
            feature_map_layout=feature_map_layout,
            depth_multiplier=self._depth_multiplier,
            min_depth=self._min_depth,
            insert_1x1_conv=True,
            image_features=image_features)

    return feature_maps.values()
Example #7
def inception_v2_ssd(img, cfg):
    # Despite the name, this builds on the InceptionV3 trunk.
    with slim.arg_scope(inception_v3.inception_v3_arg_scope()):
        _, end_points = inception_v3.inception_v3_base(img)

        Mixed_5d = end_points['Mixed_5d']
        Mixed_6e = end_points['Mixed_6e']
        vbs = slim.get_trainable_variables()

        # Use the Mixed_6e and Mixed_5d endpoints directly rather than
        # bilinearly resizing the deeper maps.
        cell_11 = Mixed_6e
        cell_7 = Mixed_5d

    cell_11 = slim.conv2d(cell_11,
                          1024,
                          kernel_size=1,
                          activation_fn=slim.nn.relu)

    cell_7 = slim.conv2d(cell_7,
                         512,
                         kernel_size=3,
                         activation_fn=slim.nn.relu)
    cell_7 = slim.conv2d(cell_7,
                         256,
                         kernel_size=1,
                         activation_fn=slim.nn.relu)

    cv6 = slim.conv2d(cell_11,
                      1024,
                      kernel_size=3,
                      rate=6,
                      activation_fn=slim.nn.relu,
                      scope='conv6')
    cv7 = slim.conv2d(cv6,
                      1024,
                      kernel_size=1,
                      activation_fn=slim.nn.relu,
                      scope='conv7')

    s = utils.normalize_to_target(cell_7, target_norm_value=16.0, dim=1)

    cv8 = inception(cv7, out_put=512, name='cv8', stride=2)
    cv9 = inception(cv8, out_put=512, name='cv9', stride=2)
    cv10 = inception(cv9, out_put=512, name='cv10', stride=2)
    cv11 = inception(cv10, out_put=512, name='cv11', stride=2)

    source = [s, cv7, cv8, cv9, cv10, cv11]
    conf = []
    loc = []
    for cv, num in zip(source, cfg.Config['aspect_num']):
        loc.append(
            slim.conv2d(cv,
                        num * 4,
                        kernel_size=3,
                        stride=1,
                        activation_fn=None))

        conf.append(
            slim.conv2d(cv,
                        num * cfg.Config['num_classes'],
                        kernel_size=3,
                        stride=1,
                        activation_fn=None))
    loc = tf.concat(
        [tf.reshape(o, shape=(cfg.batch_size, -1, 4)) for o in loc], axis=1)
    conf = tf.concat(
        [tf.reshape(o, shape=(cfg.batch_size, -1, cfg.Config['num_classes']))
         for o in conf], axis=1)

    return loc, conf, vbs
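A hedged usage sketch: DummyCfg is a hypothetical stand-in matching the fields the function reads (cfg.batch_size, cfg.Config['aspect_num'], cfg.Config['num_classes']), and it assumes the helpers the function calls but does not define here (inception, utils.normalize_to_target) are in scope:

import tensorflow as tf

class DummyCfg(object):
    # Hypothetical config mirroring the fields inception_v2_ssd reads;
    # six entries in 'aspect_num', one per source feature map.
    batch_size = 8
    Config = {'aspect_num': [4, 6, 6, 6, 4, 4], 'num_classes': 21}

img = tf.placeholder(tf.float32, [8, 512, 512, 3])
loc, conf, vbs = inception_v2_ssd(img, DummyCfg())
# loc:  [batch, num_anchors, 4] box offsets
# conf: [batch, num_anchors, num_classes] class scores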
Example #8
def inception_v3(images,
                 trainable=True,
                 is_training=True,
                 weight_decay=0.00004,
                 stddev=0.1,
                 dropout_keep_prob=0.8,
                 use_batch_norm=True,
                 batch_norm_params=None,
                 add_summaries=True,
                 scope="InceptionV3"):
    """Builds an Inception V3 subgraph for image embeddings.
  Args:
    images: A float32 Tensor of shape [batch, height, width, channels].
    trainable: Whether the inception submodel should be trainable or not.
    is_training: Boolean indicating training mode or not.
    weight_decay: Coefficient for weight regularization.
    stddev: The standard deviation of the truncated normal weight initializer.
    dropout_keep_prob: Dropout keep probability.
    use_batch_norm: Whether to use batch normalization.
    batch_norm_params: Parameters for batch normalization. See
      tf.contrib.layers.batch_norm for details.
    add_summaries: Whether to add activation summaries.
    scope: Optional Variable scope.
  Returns:
    end_points: A dictionary of activations from inception_v3 layers.
  """
    # Only consider the inception model to be in training mode if it's trainable.
    is_inception_model_training = trainable and is_training

    if use_batch_norm:
        # Default parameters for batch normalization.
        if not batch_norm_params:
            batch_norm_params = {
                "is_training": is_inception_model_training,
                "trainable": trainable,
                # Decay for the moving averages.
                "decay": 0.9997,
                # Epsilon to prevent 0s in variance.
                "epsilon": 0.001,
                # Collection containing the moving mean and moving variance.
                "variables_collections": {
                    "beta": None,
                    "gamma": None,
                    "moving_mean": ["moving_vars"],
                    "moving_variance": ["moving_vars"],
                }
            }
    else:
        batch_norm_params = None

    if trainable:
        weights_regularizer = tf.contrib.layers.l2_regularizer(weight_decay)
    else:
        weights_regularizer = None

    with tf.variable_scope(scope, "InceptionV3", [images],
                           reuse=tf.AUTO_REUSE) as scope:
        with slim.arg_scope([slim.conv2d, slim.fully_connected],
                            weights_regularizer=weights_regularizer,
                            trainable=trainable):
            with slim.arg_scope(
                [slim.conv2d],
                    weights_initializer=tf.truncated_normal_initializer(
                        stddev=stddev),
                    activation_fn=tf.nn.relu,
                    normalizer_fn=slim.batch_norm,
                    normalizer_params=batch_norm_params):
                net, end_points = inception_v3_base(images, scope=scope)
                with tf.variable_scope("logits"):
                    shape = net.get_shape()
                    net = slim.avg_pool2d(net,
                                          shape[1:3],
                                          padding="VALID",
                                          scope="pool")
                    net = slim.dropout(net,
                                       keep_prob=dropout_keep_prob,
                                       is_training=is_inception_model_training,
                                       scope="dropout")
                    net = slim.flatten(net, scope="flatten")

    # Add summaries.
    if add_summaries:
        for v in end_points.values():
            tf.contrib.layers.summaries.summarize_activation(v)

    return net, end_points
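A minimal usage sketch for the embedding builder, assuming the surrounding imports (slim, inception_v3_base) are in scope; the shapes are illustrative:

import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 299, 299, 3])
# `net` is the flattened pooled embedding; `end_points` exposes the
# intermediate Inception activations by name.
net, end_points = inception_v3(images, trainable=False, is_training=False)

With trainable=False, no weight regularizer is added and batch norm runs in inference mode, the usual setting when the trunk serves as a frozen feature extractor.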
Example #9
def inception_v3_fcn(inputs,
                     num_classes=1000,
                     is_training=True,
                     dropout_keep_prob=0.8,
                     min_depth=16,
                     depth_multiplier=1.0,
                     prediction_fn=slim.softmax,
                     spatial_squeeze=True,
                     reuse=None,
                     scope='InceptionV3'):
    """Inception model from http://arxiv.org/abs/1512.00567.

  "Rethinking the Inception Architecture for Computer Vision"

  Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens,
  Zbigniew Wojna.

  With the default arguments this method constructs the exact model defined in
  the paper. However, one can experiment with variations of the inception_v3
  network by changing arguments dropout_keep_prob, min_depth and
  depth_multiplier.

  The default image size used to train this network is 299x299.

  Args:
    inputs: a tensor of size [batch_size, height, width, channels].
    num_classes: number of predicted classes.
    is_training: whether the network is in training mode.
    dropout_keep_prob: the percentage of activation values that are retained.
    min_depth: Minimum depth value (number of channels) for all convolution ops.
      Enforced when depth_multiplier < 1, and not an active constraint when
      depth_multiplier >= 1.
    depth_multiplier: Float multiplier for the depth (number of channels)
      for all convolution ops. The value must be greater than zero. Typical
      usage will be to set this value in (0, 1) to reduce the number of
      parameters or computation cost of the model.
    prediction_fn: a function to get predictions out of logits.
    spatial_squeeze: if True, logits is of shape [B, C]; if False, logits is
        of shape [B, 1, 1, C], where B is batch_size and C is number of classes.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    net: the pooled pre-logits activations; no classification head is built
      here, so num_classes, prediction_fn and spatial_squeeze are currently
      unused.
    end_points: a dictionary from components of the network to the
      corresponding activation.

  Raises:
    ValueError: if 'depth_multiplier' is less than or equal to zero.
  """
    if depth_multiplier <= 0:
        raise ValueError('depth_multiplier is not greater than zero.')
    depth = lambda d: max(int(d * depth_multiplier), min_depth)

    with tf.variable_scope(scope,
                           'InceptionV3', [inputs, num_classes],
                           reuse=reuse) as scope:
        with slim.arg_scope([slim.batch_norm, slim.dropout],
                            is_training=is_training):
            net, end_points = inception_v3.inception_v3_base(
                inputs,
                scope=scope,
                min_depth=min_depth,
                depth_multiplier=depth_multiplier)

            # Final pooling
            with tf.variable_scope('Logits'):
                kernel_size = _reduced_kernel_size_for_small_input(net, [8, 8])
                net = slim.avg_pool2d(
                    net,
                    kernel_size,
                    padding='VALID',
                    scope='AvgPool_1a_{}x{}'.format(*kernel_size))
                end_points['PreLogits'] = net
                # TODO: should the endpoint name be changed here?
                # Upsampling is done at inference.
    return net, end_points
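_reduced_kernel_size_for_small_input shrinks the default 8x8 pooling kernel when the incoming feature map is smaller (e.g. for inputs below 299x299). A sketch matching the helper in slim's Inception implementation; treat it as an assumption if your copy differs:

def _reduced_kernel_size_for_small_input(input_tensor, kernel_size):
    """Clamp kernel_size to the static spatial dims when they are known."""
    shape = input_tensor.get_shape().as_list()
    if shape[1] is None or shape[2] is None:
        # Dynamic spatial dims: keep the requested kernel size.
        return kernel_size
    return [min(shape[1], kernel_size[0]),
            min(shape[2], kernel_size[1])]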