Esempio n. 1
0
    def __init__(self, img_dim, embed_size, data_name,
                 use_abs=False, no_imgnorm=False):
        """Create the layers of the precomputed-feature image encoder.

        Args:
            img_dim: Input size of the ``fc`` linear projection.
            embed_size: Output size of ``fc``; also used as the GRU
                input/hidden size and as the channel width of every
                ``Rs_GCN`` layer.
            data_name: Dataset identifier. A ``BatchNorm1d`` layer
                (``self.bn``) is created only when this equals
                ``'f30k_precomp'``.
            use_abs: Stored as ``self.use_abs``; not consumed here.
            no_imgnorm: Stored as ``self.no_imgnorm``; not consumed here.
        """
        super(EncoderImagePrecompAttn, self).__init__()

        # Plain configuration values kept on the instance.
        self.data_name = data_name
        self.embed_size = embed_size
        self.use_abs = use_abs
        self.no_imgnorm = no_imgnorm

        # Projection img_dim -> embed_size. init_weights() is defined
        # elsewhere on the class and runs while fc is the only layer present.
        self.fc = nn.Linear(img_dim, embed_size)
        self.init_weights()

        # Single-layer, batch-first GRU (tagged "GSR" in the original code).
        self.img_rnn = nn.GRU(embed_size, embed_size, num_layers=1,
                              batch_first=True)

        # Four GCN reasoning layers sharing the same channel configuration.
        # Attribute names are kept explicit because they become parameter
        # name prefixes in the module's state_dict.
        gcn_kwargs = dict(in_channels=embed_size, inter_channels=embed_size)
        self.Rs_GCN_1 = Rs_GCN(**gcn_kwargs)
        self.Rs_GCN_2 = Rs_GCN(**gcn_kwargs)
        self.Rs_GCN_3 = Rs_GCN(**gcn_kwargs)
        self.Rs_GCN_4 = Rs_GCN(**gcn_kwargs)

        # Batch normalisation exists only for the 'f30k_precomp' dataset.
        if data_name == 'f30k_precomp':
            self.bn = nn.BatchNorm1d(embed_size)
    def __init__(self, vocab_size, word_dim, embed_size=2048,
                 inter_channels=2048, use_atten=False,
                 word_vec='./vocab/word2vec300d_init_threshold4.npy'):
        """Create the layers of the text encoder.

        Args:
            vocab_size: Number of rows in the embedding table.
            word_dim: Embedding dimension; also the input size of ``fc``.
            embed_size: Hidden size (per direction) of the bidirectional GRU.
            inter_channels: Output size of ``fc``, input size of the GRU and
                channel width of every ``Rs_GCN`` layer.
            use_atten: Stored as ``self.use_atten``; not consumed here.
            word_vec: Path to a ``.npy`` file, read with ``np.load``, whose
                contents are copied into the embedding weights.
        """
        super(EncoderTextPrecompAttn, self).__init__()
        self.embed_size = embed_size
        self.use_atten = use_atten

        # Embedding table, then overwrite its rows with the vectors on disk.
        # The table has exactly vocab_size rows, so the slice spans all of it.
        self.embed = nn.Embedding(vocab_size, word_dim)
        pretrained = np.load(word_vec)
        self.embed.weight.data[:vocab_size] = torch.from_numpy(pretrained)

        # Projection word_dim -> inter_channels.
        self.fc = nn.Linear(word_dim, inter_channels)

        # NOTE(review): init_weights() is defined elsewhere and runs after the
        # pretrained vectors were copied in; if it re-initialises self.embed
        # the loaded vectors would be lost -- confirm.
        self.init_weights()

        # Single-layer, batch-first, bidirectional GRU (tagged "GSR" in the
        # original code). The attribute is called img_rnn even though this is
        # the text encoder; the name is kept as-is.
        self.img_rnn = nn.GRU(inter_channels, embed_size, num_layers=1,
                              batch_first=True, bidirectional=True)

        # Four GCN reasoning layers sharing the same channel configuration.
        gcn_kwargs = dict(in_channels=inter_channels,
                          inter_channels=inter_channels)
        self.Rs_GCN_1 = Rs_GCN(**gcn_kwargs)
        self.Rs_GCN_2 = Rs_GCN(**gcn_kwargs)
        self.Rs_GCN_3 = Rs_GCN(**gcn_kwargs)
        self.Rs_GCN_4 = Rs_GCN(**gcn_kwargs)
    def __init__(self, img_dim, embed_size, inter_channels=2048,
                 use_atten=False, use_box=False, use_label=False):
        """Create the layers of the image encoder (variant without ``fc``).

        Args:
            img_dim: Not used by this constructor; the projection that
                consumed it is disabled in this variant.
            embed_size: Hidden size (per direction) of the bidirectional GRU;
                also the output size of ``fc_box`` and the input size of
                ``fc_class`` when those layers are enabled.
            inter_channels: Input size of the GRU and channel width of every
                ``Rs_GCN`` layer.
            use_atten: Stored as ``self.use_atten``; not consumed here.
            use_box: When true, create ``fc_box = Linear(6, embed_size)``.
            use_label: When true, create ``fc_class = Linear(embed_size, 33)``.
        """
        super(EncoderImagePrecompAttn, self).__init__()
        self.embed_size = embed_size
        self.use_atten = use_atten
        self.use_box = use_box
        self.use_label = use_label

        # Optional heads, created only when the matching flag is set.
        if use_box:
            self.fc_box = nn.Linear(6, embed_size)
        if use_label:
            self.fc_class = nn.Linear(embed_size, 33)

        # NOTE: the img_dim -> inter_channels projection (fc) and the
        # accompanying init_weights() call are disabled in this variant.

        # Single-layer, batch-first, bidirectional GRU (tagged "GSR" in the
        # original code).
        self.img_rnn = nn.GRU(inter_channels, embed_size, num_layers=1,
                              batch_first=True, bidirectional=True)

        # Four GCN reasoning layers sharing the same channel configuration.
        gcn_kwargs = dict(in_channels=inter_channels,
                          inter_channels=inter_channels)
        self.Rs_GCN_1 = Rs_GCN(**gcn_kwargs)
        self.Rs_GCN_2 = Rs_GCN(**gcn_kwargs)
        self.Rs_GCN_3 = Rs_GCN(**gcn_kwargs)
        self.Rs_GCN_4 = Rs_GCN(**gcn_kwargs)