def __init__(self, input_size, hidden_size, dropout):
        """Build the Sentinel module's affine layers and dropout.

        Args:
            input_size: feature size of the input vector.
            hidden_size: feature size of the hidden state.
            dropout: dropout probability applied before the affine maps.
        """
        super(Sentinel, self).__init__()

        def _init_linear(in_features, out_features):
            # Create a linear layer and apply the project's 'linear' init.
            layer = nn.Linear(in_features, out_features)
            init_weights(layer, 'linear')
            return layer

        # One affine map over the input, one over the hidden state.
        self.affine_x = _init_linear(input_size, hidden_size)
        self.affine_h = _init_linear(hidden_size, hidden_size)

        # Dropout applied before the affine transformation.
        self.dropout = nn.Dropout(dropout)
 def __init__(self, args):
     """Construct one Transformer encoder layer from the fields of ``args``.

     Reads: embed_size, encoder_attention_heads, attention_dropout,
     dropout, relu_dropout, encoder_normalize_before,
     encoder_ffn_embed_dim.
     """
     super(TransformerEncoderLayer, self).__init__()
     self.embed_dim = args.embed_size

     # Multi-head self-attention sub-layer.
     self.self_attn = MultiheadAttention(
             self.embed_dim,
             args.encoder_attention_heads,
             dropout=args.attention_dropout,
             )

     # Dropout rates kept as plain floats (presumably applied in forward —
     # not visible from this chunk).
     self.dropout = args.dropout
     self.relu_dropout = args.relu_dropout
     self.normalize_before = args.encoder_normalize_before

     def _ffn_linear(in_dim, out_dim):
         # Create a feed-forward linear layer with the project's 'relu' init.
         fc = nn.Linear(in_dim, out_dim)
         utils.init_weights(fc, 'relu')
         return fc

     # Position-wise feed-forward sub-layer: expand then project back.
     self.fc1 = _ffn_linear(self.embed_dim, args.encoder_ffn_embed_dim)
     self.fc2 = _ffn_linear(args.encoder_ffn_embed_dim, self.embed_dim)

     # Two layer norms (one per sub-layer).
     self.layer_norms = nn.ModuleList(
             [nn.LayerNorm(self.embed_dim) for _ in range(2)])
    def __init__(self, embed_size, hidden_size, decoder_attn_embed_size,
                 vocab_size, dropout):
        """Assemble the adaptive-attention block.

        Args:
            embed_size: word-embedding feature size (passed to Sentinel).
            hidden_size: decoder hidden-state size.
            decoder_attn_embed_size: attention projection size (for Atten).
            vocab_size: output vocabulary size of the final projection.
            dropout: dropout probability shared by all sub-modules.
        """
        super(AdaptiveBlock, self).__init__()

        # Sentinel sub-module.
        self.sentinel = Sentinel(embed_size, hidden_size, dropout)

        # Image spatial-attention sub-module.
        self.atten = Atten(hidden_size, decoder_attn_embed_size, dropout)

        # Final projection from hidden state to vocabulary scores.
        final_proj = nn.Linear(hidden_size, vocab_size)
        init_weights(final_proj, 'linear')
        self.mlp = final_proj

        # Dropout used around the affine transformation.
        self.dropout = nn.Dropout(dropout)

        # Kept so forward code can size hidden/cell tensors.
        self.hidden_size = hidden_size
    def __init__(self, hidden_size, decoder_attn_embed_size, dropout):
        """Build the spatial-attention projections W_v, W_g, W_s and w_h.

        Args:
            hidden_size: feature size of the inputs being attended over.
            decoder_attn_embed_size: shared attention projection size.
            dropout: dropout probability.
        """
        super(Atten, self).__init__()

        def _init_linear(in_dim, out_dim):
            # Create a linear layer and apply the project's 'linear' init.
            layer = nn.Linear(in_dim, out_dim)
            init_weights(layer, 'linear')
            return layer

        self.affine_v = _init_linear(hidden_size, decoder_attn_embed_size)  # W_v
        self.affine_g = _init_linear(hidden_size, decoder_attn_embed_size)  # W_g
        self.affine_s = _init_linear(hidden_size, decoder_attn_embed_size)  # W_s
        self.affine_h = _init_linear(decoder_attn_embed_size, 1)            # w_h

        self.dropout = nn.Dropout(dropout)
    def __init__(self, args, word_emb):
        """Build the adaptive-attention caption decoder.

        Args:
            args: config object; reads relu_dropout, embed_size,
                hidden_size, decoder_attn_embed_size, vocab_size, dropout.
            word_emb: pre-trained word-embedding weights, loaded frozen.
        """
        super(AdaptiveDecoder, self).__init__()

        self.relu_dropout = args.relu_dropout

        # Frozen pre-trained word embeddings.
        # NOTE(review): relies on `from_pretrained` being defined on this
        # class — nn.Module itself has no such method; confirm it exists
        # elsewhere in the file (nn.Embedding.from_pretrained may have been
        # intended).
        self.embed = self.from_pretrained(word_emb, freeze=True)

        # Projections from embedding space to hidden-state size; names
        # suggest they produce the LSTM's initial hidden/cell states —
        # confirm against forward().
        self.w_to_h = nn.Linear(args.embed_size, args.hidden_size)
        init_weights(self.w_to_h, 'relu')
        self.w_to_c = nn.Linear(args.embed_size, args.hidden_size)
        init_weights(self.w_to_c, 'relu')

        # Single-layer, batch-first LSTM decoder.
        # NOTE(review): the original comment said input = [w_t; v_g]
        # (2 x word_embed_size), but input_size here is args.embed_size —
        # confirm which is intended.
        self.LSTM = nn.LSTM(args.embed_size,
                            args.hidden_size,
                            1,
                            batch_first=True)
        init_weights(self.LSTM)

        # Saved so hidden and cell tensors can be sized later.
        self.hidden_size = args.hidden_size

        # Adaptive attention block:
        # sentinel + c_hat + final scores for caption sampling.
        self.adaptive = AdaptiveBlock(args.embed_size, args.hidden_size,
                                      args.decoder_attn_embed_size,
                                      args.vocab_size, args.dropout)