Example #1
def _soft_attention(im, ctx, embed_dim, keep_prob=1.0, scope=""):
    scope = scope or "Att"
    with tf.variable_scope(scope):
        # fuse image features with the context, then attention-pool to a vector
        im_ctx = mlb(im, ctx, embed_dim, keep_prob)
        fv = _soft_attention_pool(im, im_ctx)
    # fv = tf.expand_dims(fv, 1)
    return fv
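Every example on this page calls a repo-internal `mlb` helper (and Example #1 also calls `_soft_attention_pool`), neither of which is shown. A minimal sketch of what an MLB-style (multimodal low-rank bilinear) fusion and the attention pooling could look like, assuming TF 1.x with slim; the names, shapes, and exact layer choices are assumptions, not the repo's actual implementation:

import tensorflow as tf
import tensorflow.contrib.slim as slim

def mlb(im, ctx, embed_dim, keep_prob=1.0, scope='MLB'):
    """Hypothetical MLB fusion: tanh-project both inputs, Hadamard product."""
    with tf.variable_scope(scope):
        if im.get_shape().ndims == 4:
            # 1x1 conv = position-wise linear projection of a feature map
            im_proj = slim.conv2d(im, embed_dim, [1, 1],
                                  activation_fn=tf.nn.tanh, scope='im_proj')
        else:
            im_proj = slim.fully_connected(im, embed_dim,
                                           activation_fn=tf.nn.tanh,
                                           scope='im_proj')
        ctx_proj = slim.fully_connected(ctx, embed_dim,
                                        activation_fn=tf.nn.tanh,
                                        scope='ctx_proj')
        if im.get_shape().ndims == 4:
            # broadcast the context vector over all spatial positions
            ctx_proj = tf.reshape(ctx_proj, [-1, 1, 1, embed_dim])
        joint = tf.multiply(im_proj, ctx_proj)  # Hadamard fusion
        return tf.nn.dropout(joint, keep_prob)

def _soft_attention_pool(im, im_ctx):
    """Hypothetical soft attention pooling: one attention map, weighted sum."""
    _, h, w, _ = im.get_shape().as_list()
    logits = slim.conv2d(im_ctx, 1, [1, 1], activation_fn=None,
                         scope='att_logits')            # [N, h, w, 1]
    att = tf.nn.softmax(tf.reshape(logits, [-1, h * w]))
    att = tf.reshape(att, [-1, h, w, 1])
    return tf.reduce_sum(im * att, axis=[1, 2])         # [N, C]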
Example #2
def build_attention_vaq_model(im,
                              ans_embed,
                              quest,
                              quest_len,
                              embed_dim,
                              vocab_size,
                              keep_prob,
                              pad_token,
                              num_dec_cells,
                              phase='train'):
    # soft attention pooling over the image, conditioned on the answer embedding
    fv = softmax_attention(im,
                           ans_embed,
                           embed_dim,
                           keep_prob=keep_prob,
                           scope='AnsAttention')
    in_embed = mlb(fv, ans_embed, embed_dim, keep_prob, scope='VAEmbed')
    with tf.variable_scope('vaq'):
        if phase == 'train':
            inputs, targets, length = _build_caption_inputs_and_targets(
                quest, quest_len)
            return build_lstm_decoder(in_embed, inputs, length, targets,
                                      vocab_size, num_dec_cells, keep_prob,
                                      pad_token)
        else:
            return build_lstm_predictor(in_embed, quest, vocab_size,
                                        num_dec_cells, pad_token)
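A hypothetical call site for the builder above; the placeholder shapes are assumptions, and in the 'train' phase the return value is presumably the decoder loss from build_lstm_decoder:

import tensorflow as tf

# hypothetical placeholder shapes
im = tf.placeholder(tf.float32, [None, 14, 14, 2048])  # CNN feature map
ans_embed = tf.placeholder(tf.float32, [None, 512])    # answer embedding
quest = tf.placeholder(tf.int32, [None, 20])           # question token ids
quest_len = tf.placeholder(tf.int32, [None])           # question lengths

loss = build_attention_vaq_model(im, ans_embed, quest, quest_len,
                                 embed_dim=512, vocab_size=10000,
                                 keep_prob=0.7, pad_token=0,
                                 num_dec_cells=512, phase='train')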
Example #3
def compute_gates(g_im, ctx, embed_dim, num_outputs, keep_prob):
    # is_training = keep_prob != 1.0
    g_h = mlb(g_im, ctx, embed_dim, keep_prob=keep_prob, scope='gate')
    g_logits = slim.fully_connected(g_h,
                                    num_outputs,
                                    activation_fn=None,
                                    scope='g_logits')
    # softmax turns the gate logits into mixture weights over num_outputs branches
    return tf.nn.softmax(g_logits)
Example #4
def low_rank_attention(im,
                       ctx,
                       embed_dim,
                       num_rank,
                       keep_prob,
                       scope='LR_att'):
    with tf.variable_scope(scope):
        im_ctx = mlb(im, ctx, embed_dim, keep_prob)
        fv = _low_rank_attention_pool(im, im_ctx, num_rank)
    return fv
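`_low_rank_attention_pool` is not shown either; a plausible reading is multi-glimpse attention: num_rank attention maps over the spatial grid, each pooling the image features, then merged. A hypothetical sketch under the same TF 1.x/slim assumptions (whether glimpses are averaged or concatenated is itself an assumption):

def _low_rank_attention_pool(im, im_ctx, num_rank):
    """Hypothetical multi-glimpse pooling with num_rank attention maps."""
    _, h, w, c = im.get_shape().as_list()
    logits = slim.conv2d(im_ctx, num_rank, [1, 1], activation_fn=None,
                         scope='att_logits')              # [N, h, w, R]
    att = tf.reshape(logits, [-1, h * w, num_rank])
    att = tf.nn.softmax(att, dim=1)                       # normalize over positions
    im_flat = tf.reshape(im, [-1, h * w, c])
    glimpses = tf.matmul(att, im_flat, transpose_a=True)  # [N, R, c]
    return tf.reduce_mean(glimpses, axis=1)               # [N, c]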
Example #5
def semantic_attention(attr,
                       quest_embed,
                       embed_dim,
                       keep_prob,
                       scope='SemAtt'):
    with tf.variable_scope(scope):
        aq_embed = mlb(attr, quest_embed, embed_dim, keep_prob, scope='fusion')
        gates = slim.fully_connected(aq_embed,
                                     1,
                                     activation_fn=tf.nn.sigmoid,
                                     scope='gates')
        # apply gates
        gated_attr = tf.multiply(attr, gates)
    return gated_attr
Example #6
def conditional_attention_cell_helper(im,
                                      a,
                                      part_q,
                                      embed_dim,
                                      keep_prob=1.0,
                                      scope=""):
    scope = scope or "ConditionalAttentionCell"
    _, h, w, c = im.get_shape().as_list()
    with tf.variable_scope(scope):
        # QA joint embedding
        ctx = concat_fusion(part_q, a, embed_dim)
        # soft attention
        im_ctx = mlb(im, ctx, embed_dim, keep_prob, scope='Matching')
        v, am = _soft_attention_pool_with_map(im, im_ctx)
        am = tf.reshape(am, shape=[-1, h * w])
    return v, ctx, am
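Example #6 leans on two more repo-internal helpers that are not shown: `concat_fusion` and `_soft_attention_pool_with_map`. Plausible sketches under the same TF 1.x/slim assumptions (hypothetical, not the repo's code):

def concat_fusion(a, b, embed_dim, scope='ConcatFusion'):
    """Hypothetical: concatenate two vectors and project to embed_dim."""
    with tf.variable_scope(scope):
        joint = tf.concat([a, b], axis=1)
        return slim.fully_connected(joint, embed_dim,
                                    activation_fn=tf.nn.tanh, scope='proj')

def _soft_attention_pool_with_map(im, im_ctx):
    """Hypothetical: soft attention pooling that also returns the map."""
    _, h, w, _ = im.get_shape().as_list()
    logits = slim.conv2d(im_ctx, 1, [1, 1], activation_fn=None,
                         scope='att_logits')             # [N, h, w, 1]
    am = tf.nn.softmax(tf.reshape(logits, [-1, h * w]))  # attention weights
    am_map = tf.reshape(am, [-1, h, w, 1])
    v = tf.reduce_sum(im * am_map, axis=[1, 2])          # [N, C]
    return v, am_map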
Example #7
def __call__(self, inputs, state, scope=None):
    """Attention cell with answer context."""
    with tf.variable_scope(scope or type(self).__name__):
        with tf.variable_scope('Attention'):
            v, ctx, am = conditional_attention_cell_helper(
                self._context,
                self._answer_context,
                inputs,
                self._embed_dim,
                keep_prob=self._keep_prob)
            h = mlb(v,
                    ctx,
                    self._embed_dim,
                    self._keep_prob,
                    scope='OutputMLB')
            # residual connection
            h = inputs + h
    return h, h
Example #8
def _scale_specific_vq_prediction(net,
                                  ctx,
                                  embed_dim,
                                  num_ans,
                                  keep_prob,
                                  scope,
                                  expand_dim=True):
    with tf.variable_scope(scope):
        v = _soft_attention(net, ctx, embed_dim, keep_prob=keep_prob)
        pre_logits = mlb(v, ctx, embed_dim, keep_prob, scope='pre_logits')
        logits = slim.fully_connected(pre_logits,
                                      num_ans,
                                      activation_fn=None,
                                      scope='logits')
        if expand_dim:
            return tf.expand_dims(logits, 1)
        else:
            return logits
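A hypothetical way these scale-specific heads could be combined with the gates from Example #3; the per-scale feature maps (net_s0, net_s1, net_s2) and all shapes here are assumptions:

# one prediction head per scale, each returning [N, 1, num_ans]
logits_list = [
    _scale_specific_vq_prediction(net, ctx, 512, num_ans, keep_prob,
                                  scope='scale%d' % i, expand_dim=True)
    for i, net in enumerate([net_s0, net_s1, net_s2])  # hypothetical inputs
]
scale_logits = tf.concat(logits_list, axis=1)           # [N, 3, num_ans]
gates = compute_gates(g_im, ctx, 512, num_outputs=3,
                      keep_prob=keep_prob)              # [N, 3]
# convex combination of the per-scale predictions
logits = tf.reduce_sum(tf.expand_dims(gates, 2) * scale_logits, axis=1)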
Example #9
def __call__(self, inputs, state, scope='MultiModalAttentionCell'):
    """Attention cell with answer context."""
    with tf.variable_scope(scope):
        with tf.variable_scope('Attention'):
            v, ctx, am = conditional_attention_cell_helper(
                self._context,
                self._answer_context,
                inputs,
                self._embed_dim,
                keep_prob=self._keep_prob)
            h = mlb(v,
                    ctx,
                    self._embed_dim,
                    self._keep_prob,
                    scope='OutputMLB')
            # residual connection for the output
            new_h = inputs + h
    # output and new state (the state is the pre-residual activation)
    return new_h, h
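A hypothetical sketch of driving this cell with tf.nn.dynamic_rnn; the constructor signature and input shapes are assumptions, and the class is presumed to implement the RNNCell interface:

cell = MultiModalAttentionCell(embed_dim=512,
                               context=im,                # [N, H, W, C]
                               answer_context=ans_embed,  # [N, D]
                               keep_prob=0.8)             # hypothetical ctor
outputs, final_state = tf.nn.dynamic_rnn(cell,
                                         quest_embeds,    # [N, T, embed_dim]
                                         sequence_length=quest_len,
                                         dtype=tf.float32)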