Code example #1
  def encode(self, feature, length, scope=None):
    """Encodes sequence features into representation.

    Args:
      feature: A [batch, max_sequence_len, dims] float tensor.
      length: A [batch] int tensor.
      scope: Optional name scope (unused; this encoder creates no variables).

    Returns:
      A [batch, dims] float tensor.
    """
    with tf.name_scope('avg_pooling_encoder'):
      mask = tf.sequence_mask(
          length, maxlen=utils.get_tensor_shape(feature)[-2], dtype=tf.float32)
      feature = utils.masked_avg_nd(data=feature, mask=mask, dim=1)
      return tf.squeeze(feature, axis=1)
Code example #2
def _average_encoding(sequence_feature, sequence_length):
  """Encodes sequence using Average pooling.

  Args:
    sequence_feature: A [batch_sequence, max_sequence_length, feature_dimensions]
      float tensor.
    sequence_length: a [batch_sequence] int tensor.

  Returns:
    sequence_emb: A [batch_sequence, feature_dimensions] float tensor
      representing the embedding vectors.
  """
  (_, max_sequence_length, _) = utils.get_tensor_shape(sequence_feature)

  mask = tf.sequence_mask(
      sequence_length, maxlen=max_sequence_length, dtype=tf.float32)

  sequence_emb = utils.masked_avg_nd(sequence_feature, mask, dim=1)
  sequence_emb = tf.squeeze(sequence_emb, axis=1)
  return sequence_emb
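
Examples #1 and #2 both call `utils.get_tensor_shape`, which is not shown on this page. A minimal sketch of such a helper, assuming it follows the common TF 1.x pattern of returning static dimensions where known and dynamic ones otherwise (inferred from the call sites above, not the project's actual code):

import tensorflow as tf

def get_tensor_shape(tensor):
  """Returns a list of dims: Python ints where static, scalar tensors otherwise."""
  static_shape = tensor.get_shape().as_list()
  dynamic_shape = tf.shape(tensor)
  return [
      static_dim if static_dim is not None else dynamic_shape[i]
      for i, static_dim in enumerate(static_shape)
  ]

Both usage patterns above work with this sketch: indexing (`get_tensor_shape(feature)[-2]`) and tuple unpacking (`(_, max_sequence_length, _) = ...`).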
Code example #3
    def test_masked_avg_nd(self):
        tf.reset_default_graph()

        data = tf.placeholder(tf.float32, shape=[None, None, None])
        mask = tf.placeholder(tf.float32, shape=[None, None])
        masked_avgs = utils.masked_avg_nd(data, mask)

        with self.test_session() as sess:
            result = sess.run(masked_avgs,
                              feed_dict={
                                  data: [[[1, 2], [3, 4], [5, 6]],
                                         [[7, 8], [9, 10], [11, 12]]],
                                  mask: [[1, 0, 1], [0, 1, 0]]
                              })
            self.assertAllClose(result, [[[3, 4]], [[9, 10]]])

            result = sess.run(masked_avgs,
                              feed_dict={
                                  data: [[[1, 2], [3, 4], [5, 6]],
                                         [[7, 8], [9, 10], [11, 12]]],
                                  mask: [[0, 0, 0], [0, 0, 0]]
                              })
            self.assertAllClose(result, [[[0, 0]], [[0, 0]]])
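
The test above pins down the semantics of `masked_avg_nd`: it averages over `dim` while keeping the reduced axis, and rows whose mask is all zero average to zero rather than NaN. A minimal sketch that satisfies both assertions, assuming `mask` has one fewer dimension than `data` (a reconstruction, not the project's actual implementation):

import tensorflow as tf

_EPSILON = 1e-8

def masked_avg_nd(data, mask, dim=1):
  """Masked average of `data` along `dim`, keeping the reduced axis."""
  mask = tf.expand_dims(mask, axis=-1)  # broadcast over the feature axis
  masked_sum = tf.reduce_sum(data * mask, axis=dim, keepdims=True)
  count = tf.reduce_sum(mask, axis=dim, keepdims=True)
  # 0 / _EPSILON == 0, so all-masked rows yield zeros, as the test expects.
  return masked_sum / tf.maximum(count, _EPSILON)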
Code example #4
  def encode(self, feature, length, scope=None):
    """Encodes sequence features into representation.

    Args:
      feature: A [batch, max_sequence_len, dims] float tensor.
      length: A [batch] int tensor.
      scope: Optional variable scope for the projection layer.

    Returns:
      A [batch, dims] float tensor.
    """
    options = self._model_proto
    is_training = self._is_training

    mask = tf.sequence_mask(
        length, maxlen=utils.get_tensor_shape(feature)[1], dtype=tf.float32)

    feature = tf.contrib.layers.fully_connected(
        inputs=feature,
        num_outputs=feature.get_shape()[-1].value,
        activation_fn=None,
        scope=scope)

    feature = utils.masked_avg_nd(data=feature, mask=mask, dim=1)
    return tf.squeeze(feature, axis=1)
Code example #5
  def build_prediction(self, examples, **kwargs):
    """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      **kwargs: additional keyword arguments, e.g. `prediction_task`.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
    options = self._model_proto
    is_training = self._is_training

    # Image CNN features.

    inputs = examples[InputDataFields.image]
    image_features = model_utils.calc_cnn_feature(
        inputs, options.cnn_options, is_training=is_training)

    with slim.arg_scope(
        build_hyperparams(options.image_fc_hyperparams, is_training)):
      image_features = slim.fully_connected(
          image_features,
          num_outputs=options.shared_dims,
          activation_fn=None,
          scope='image')

    # Text Global-Average-Pooling features.

    (image_id, num_captions, caption_strings,
     caption_lengths) = (examples[InputDataFields.image_id],
                         examples[InputDataFields.num_captions],
                         examples[InputDataFields.caption_strings],
                         examples[InputDataFields.caption_lengths])
    image_id = tf.string_to_number(image_id, out_type=tf.int64)

    (image_ids_gathered, caption_strings_gathered,
     caption_lengths_gathered) = model_utils.gather_in_batch_captions(
         image_id, num_captions, caption_strings, caption_lengths)

    (caption_token_ids_gathered,
     caption_features_gathered) = self._extract_text_feature(
         caption_strings_gathered,
         caption_lengths_gathered,
         vocabulary_list=self._open_vocabulary_list,
         initial_embedding=self._open_vocabulary_initial_embedding,
         embedding_dims=options.embedding_dims,
         trainable=options.train_word_embedding,
         max_norm=None)

    with slim.arg_scope(
        build_hyperparams(options.text_fc_hyperparams, is_training)):
      if visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
        attn = slim.fully_connected(
            caption_features_gathered,
            num_outputs=1,
            activation_fn=None,
            scope='caption_attn')
        attn = tf.squeeze(attn, axis=-1)
      caption_features_gathered = slim.fully_connected(
          caption_features_gathered,
          num_outputs=options.shared_dims,
          activation_fn=None,
          scope='caption')

    oov = len(self._open_vocabulary_list)
    caption_masks_gathered = tf.logical_not(
        tf.equal(caption_token_ids_gathered, oov))
    caption_masks_gathered = tf.to_float(caption_masks_gathered)

    if visual_w2v_model_pb2.VisualW2vModel.GAP == options.text_feature_extractor:
      caption_features_gathered = utils.masked_avg_nd(
          data=caption_features_gathered, mask=caption_masks_gathered, dim=1)
      caption_features_gathered = tf.squeeze(caption_features_gathered, axis=1)
    elif visual_w2v_model_pb2.VisualW2vModel.ATT == options.text_feature_extractor:
      attn = utils.masked_softmax(attn, mask=caption_masks_gathered, dim=-1)
      caption_features_gathered = tf.multiply(
          tf.expand_dims(attn, axis=-1), caption_features_gathered)
      caption_features_gathered = utils.masked_sum_nd(
          caption_features_gathered, mask=caption_masks_gathered, dim=1)
      caption_features_gathered = tf.squeeze(caption_features_gathered, axis=1)
    else:
      raise ValueError('Invalid text feature extractor.')

    # Export token embeddings.

    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
      _, token_embeddings = self._encode_tokens(
          tokens=tf.constant(self._open_vocabulary_list),
          embedding_dims=options.embedding_dims,
          vocabulary_list=self._open_vocabulary_list,
          initial_embedding=self._open_vocabulary_initial_embedding,
          trainable=options.train_word_embedding)
      with slim.arg_scope(
          build_hyperparams(options.text_fc_hyperparams, is_training)):
        token_embeddings = slim.fully_connected(
            token_embeddings,
            num_outputs=options.shared_dims,
            activation_fn=None,
            scope='caption')
    var_to_assign = tf.get_variable(
        name='weights_proj',
        shape=[len(self._open_vocabulary_list), options.shared_dims])
    var_to_assign = tf.assign(var_to_assign, token_embeddings)
    tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, var_to_assign)

    tf.summary.histogram('token_embedding_proj', token_embeddings)

    # Compute similarity.

    similarity = model_utils.calc_pairwise_similarity(
        feature_a=image_features,
        feature_b=caption_features_gathered,
        l2_normalize=True,
        dropout_keep_prob=options.cross_modal_dropout_keep_prob,
        is_training=is_training)

    predictions = {
        VisualW2vPredictions.image_id: image_id,
        VisualW2vPredictions.image_ids_gathered: image_ids_gathered,
        VisualW2vPredictions.similarity: similarity,
        VisualW2vPredictions.word2vec: var_to_assign,
    }
    return predictions
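
Example #5 additionally relies on `utils.masked_softmax` and `utils.masked_sum_nd`. Plausible minimal versions, using the standard masking idioms and inferred from the call sites here and in example #6 (assumptions, not the project's code):

import tensorflow as tf

_NEG_INF = -1e9

def masked_softmax(data, mask, dim=-1):
  """Softmax over `dim`; masked entries get (near-)zero probability."""
  return tf.nn.softmax(data + (1.0 - mask) * _NEG_INF, axis=dim)

def masked_sum_nd(data, mask, dim=1):
  """Masked sum of `data` along `dim`, keeping the reduced axis."""
  return tf.reduce_sum(data * tf.expand_dims(mask, axis=-1),
                       axis=dim, keepdims=True)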
Code example #6
File: stacked_attn_model.py  Project: yekeren/WSOD
    def build_prediction(self, examples, **kwargs):
        """Builds tf graph for prediction.

    Args:
      examples: dict of input tensors keyed by name.
      prediction_task: the specific prediction task.

    Returns:
      predictions: dict of prediction results keyed by name.
    """
        options = self._model_proto
        is_training = self._is_training

        (inputs, num_proposals,
         proposals) = (examples[InputDataFields.image],
                       examples[InputDataFields.num_proposals],
                       examples[InputDataFields.proposals])

        predictions = {
            DetectionResultFields.num_proposals: num_proposals,
            DetectionResultFields.proposal_boxes: proposals,
        }

        # FRCNN.
        #   `proposal_features` shape = [batch, max_num_proposals, feature_dims].
        #   `proposal_masks` shape = [batch, max_num_proposals].

        proposal_features = self._extract_frcnn_feature(
            inputs, num_proposals, proposals)

        batch, max_num_proposals, _ = utils.get_tensor_shape(proposal_features)
        proposal_masks = tf.sequence_mask(num_proposals,
                                          maxlen=max_num_proposals,
                                          dtype=tf.float32)

        # Build the SADDN predictions.
        #   `logits_c_given_r` shape = [batch, max_num_proposals, num_classes].
        #   `logits_r_given_c` shape = [batch, max_num_proposals, num_classes].

        with tf.variable_scope('SADDN'), \
            slim.arg_scope(build_hyperparams(options.fc_hyperparams, is_training)):

            logits_c_given_r = slim.fully_connected(
                proposal_features,
                num_outputs=self._num_classes,
                activation_fn=None,
                scope='proba_c_given_r')
            logits_r_given_c = slim.fully_connected(
                proposal_features,
                num_outputs=self._num_classes,
                activation_fn=None,
                scope='proba_r_given_c')

            proba_c_given_r = tf.nn.softmax(logits_c_given_r)
            proba_r_given_c = utils.masked_softmax(
                data=logits_r_given_c,
                mask=tf.expand_dims(proposal_masks, axis=-1),
                dim=1)
            proba_r_given_c = tf.multiply(
                tf.expand_dims(proposal_masks, axis=-1), proba_r_given_c)

        tf.summary.image('inputs', inputs, max_outputs=10)
        model_utils.visl_proposals(inputs,
                                   num_proposals,
                                   proposals,
                                   name='proposals',
                                   top_k=2000)

        # SADDN iterations.

        logits_at_0 = utils.masked_avg_nd(data=logits_c_given_r,
                                          mask=proposal_masks,
                                          dim=1)
        logits_at_0 = tf.squeeze(logits_at_0, axis=1)

        logits_at_i = logits_at_0
        for i in range(options.saddn_iterations):
            # Infer the proba_c at iteration `i`.

            proba_c_at_i = tf.nn.softmax(logits_at_i)

            # Infer the proba_r by marginalizing over classes.

            proba_r_at_i = tf.multiply(tf.expand_dims(proba_c_at_i, axis=1),
                                       proba_r_given_c)
            proba_r_at_i = tf.reduce_sum(proba_r_at_i, axis=-1, keepdims=True)

            # Infer the detection results at iter `i`.

            (num_detections_at_i, detection_boxes_at_i, detection_scores_at_i,
             detection_classes_at_i) = model_utils.post_process(
                 proposals, proba_r_at_i * proba_c_given_r)

            (predictions[StackedAttnPredictions.logits + '_at_{}'.format(i)],
             predictions[DetectionResultFields.num_detections +
                         '_at_{}'.format(i)],
             predictions[DetectionResultFields.detection_boxes +
                         '_at_{}'.format(i)],
             predictions[DetectionResultFields.detection_scores +
                         '_at_{}'.format(i)],
             predictions[DetectionResultFields.detection_classes +
                         '_at_{}'.format(i)]) = (logits_at_i,
                                                 num_detections_at_i,
                                                 detection_boxes_at_i,
                                                 detection_scores_at_i,
                                                 detection_classes_at_i)

            model_utils.visl_proposals_top_k(
                inputs,
                num_detections_at_i,
                detection_boxes_at_i,
                detection_scores_at_i,
                tf.gather(self._vocabulary_list,
                          tf.to_int32(detection_classes_at_i - 1)),
                name='detection_{}'.format(i))

            # `logits_at_i` for the next iteration.

            logits_at_i = tf.multiply(proba_r_at_i, logits_c_given_r)
            logits_at_i = tf.reduce_sum(logits_at_i, axis=1)

        return predictions
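
For reference, my reading of the SADDN loop in example #6, written as update equations (notation mine, not from the project):

  P_i(r) = Σ_c P(r|c) · P_i(c)                    # proba_r_at_i
  logits_{i+1}(c) = Σ_r P_i(r) · logits(c|r)      # next logits_at_i

Each iteration re-weights the per-proposal class logits by the current estimate of how likely each proposal is to contain an object, then pools them back into image-level logits.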