def build_ban(v_dim, num_hid, gamma):
    # w_emb / q_emb from the full VQA builder (Example #2 below) are unused in this trimmed-down variant:
    # w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    # q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    # v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    v_att = BiAttention(v_dim, num_hid, num_hid, gamma)
    b_net = []
    q_prj = []
    c_prj = []
    objects = 10  # minimum number of boxes
    for i in range(gamma):
        b_net.append(BCNet(v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    return BanModel(v_att, b_net, q_prj, c_prj, gamma)
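A minimal usage sketch for this trimmed-down builder; the values mirror the base_model.build_ban call in the Encoder example further below and are otherwise illustrative:

# 512-d visual features, 512-d hidden size, 4 bi-attention glimpses;
# each glimpse gets its own BCNet, question projection, and counter projection.
ban = build_ban(v_dim=512, num_hid=512, gamma=4)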
Example #2
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False,
                              .0)
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2,
                                      dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                        classifier, counter, op, gamma)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
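A hedged sketch of how this full builder might be invoked for both tasks; the dataset objects and hyperparameter values are hypothetical, and the dataset is only assumed to expose the attributes used above (dictionary.ntoken, v_dim, num_ans_candidates):

# VQA: returns the full BanModel with classifier and counter.
vqa_model = build_ban(vqa_dataset, num_hid=1280, op='c', gamma=4, task='vqa')

# Flickr grounding: only the embeddings and bi-attention module are kept.
flickr_model = build_ban(flickr_dataset, num_hid=1280, op='c', gamma=4, task='flickr')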
Example #3
    def __init__(self,
                 v_dim,
                 q_dim,
                 h_dim,
                 h_out,
                 act='ReLU',
                 dropout=[.2, .5],
                 k=3):
        super(BCNet, self).__init__()

        self.c = 32
        self.k = k
        self.v_dim = v_dim
        self.q_dim = q_dim
        self.h_dim = h_dim
        self.h_out = h_out

        self.v_net = FCNet([v_dim, h_dim * self.k],
                           act=act,
                           dropout=dropout[0])
        self.q_net = FCNet([q_dim, h_dim * self.k],
                           act=act,
                           dropout=dropout[0])
        self.dropout = nn.Dropout(dropout[1])  # attention
        if k > 1:
            self.p_net = nn.AvgPool1d(self.k, stride=self.k)

        if h_out is None:
            pass
        elif h_out <= self.c:
            self.h_mat = nn.Parameter(
                torch.Tensor(1, h_out, 1, h_dim * self.k).normal_())
            self.h_bias = nn.Parameter(torch.Tensor(1, h_out, 1, 1).normal_())
        else:
            self.h_net = weight_norm(nn.Linear(h_dim * self.k, h_out),
                                     dim=None)
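A shape-level sketch of what this constructor sets up, with illustrative sizes; since h_out=4 is below self.c (32), the low-rank h_mat/h_bias route is taken rather than h_net:

# Hypothetical instantiation for 2048-d visual and 1024-d question features.
net = BCNet(v_dim=2048, q_dim=1024, h_dim=1024, h_out=4, k=3)
# v_net and q_net both project to h_dim * k = 3072 features,
# h_mat has shape (1, 4, 1, 3072) and h_bias has shape (1, 4, 1, 1);
# because k > 1, p_net average-pools every k=3 channels back down to h_dim.
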
    def __init__(self,
                 vocabSize,
                 embedSize,
                 rnnHiddenSize,
                 numLayers,
                 useIm,
                 imgEmbedSize,
                 imgFeatureSize,
                 numRounds,
                 isAnswerer,
                 dropout=0,
                 startToken=None,
                 endToken=None,
                 **kwargs):
        super(Encoder, self).__init__()
        self.vocabSize = vocabSize
        self.embedSize = embedSize
        self.rnnHiddenSize = rnnHiddenSize
        self.numLayers = numLayers

        # assert self.numLayers > 1, "Less than 2 layers not supported!"
        if useIm:
            # a bare `useIm=True` defaults to early fusion; 'early'/'late' strings pass through unchanged
            self.useIm = useIm if useIm != True else 'early'
        else:
            self.useIm = False
        # note: the imgEmbedSize and imgFeatureSize arguments are overridden by these fixed values
        self.imgEmbedSize = 512
        self.imgFeatureSize = 4096
        self.numRounds = numRounds
        self.dropout = dropout
        self.isAnswerer = isAnswerer
        self.startToken = startToken
        self.endToken = endToken

        # modules
        self.wordEmbed = nn.Embedding(
            self.vocabSize, self.embedSize,
            padding_idx=0)  # embed vocabSize tokens into embedSize dims; index 0 is the padding token
        # question encoder
        # image fuses early with words
        if self.useIm == 'early':
            quesInputSize = self.embedSize + self.imgEmbedSize
            self.imgNet = nn.Linear(self.imgFeatureSize, self.imgEmbedSize)
            self.imgEmbedDropout = nn.Dropout(0.5)
        elif self.useIm == 'late':
            quesInputSize = self.embedSize  # (300)
            self.imgNet = nn.Linear(self.imgFeatureSize, self.imgEmbedSize)  # (4096, 512)
            self.imgEmbedDropout = nn.Dropout(0.5)
        else:
            # no image fusion: the question/answer encoder consumes word embeddings alone
            quesInputSize = self.embedSize
        if self.isAnswerer:
            self.fc_c = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
            self.fc_h = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
            self.quesRNN = nn.LSTM(
                quesInputSize,
                self.rnnHiddenSize,
                self.numLayers,
                batch_first=True,
                dropout=self.dropout)
            self.Wq_1 = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
            self.Wh_1 = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
            self.fc1 = nn.Linear(2 * self.rnnHiddenSize, self.rnnHiddenSize)
            self.dialogRNN = nn.LSTM(
                self.rnnHiddenSize,
                self.rnnHiddenSize,
                self.numLayers,
                batch_first=False,
                dropout=self.dropout)
            self.q_net = FCNet([self.rnnHiddenSize, self.rnnHiddenSize])
            self.v_net = FCNet([self.rnnHiddenSize, self.rnnHiddenSize])

        else:
            self.ansRNN = nn.LSTM(
                quesInputSize,
                self.rnnHiddenSize,
                self.numLayers,
                batch_first=True,
                dropout=dropout)
            self.Wq_1 = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
            self.Wh_1 = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
            self.fc1 = nn.Linear(3 * self.rnnHiddenSize, self.rnnHiddenSize)
            self.dialogRNN = nn.LSTM(
                self.rnnHiddenSize,
                self.rnnHiddenSize,
                self.numLayers,
                batch_first=False,
                dropout=self.dropout)
        # history encoder
        self.factRNN = nn.LSTM(
            self.embedSize,  # (300)
            self.rnnHiddenSize,  # (512)
            self.numLayers,  # (2)
            batch_first=True,
            dropout=dropout)

        # define MN
        settings = {
            # "use_cuda": True,
            "num_vocab": self.vocabSize,
            "embedding_dim": self.embedSize,
            "sentence_size": 32,
            "max_hops": 1,
            "dropout": self.dropout
        }
        self.men_n2n = MemN2N(settings)
        self.ban = base_model.build_ban(v_dim=512, num_hid=512, gamma=4).cuda()
        self.ATTMODULE = ATT_MODULE()
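
A hedged construction sketch for this Encoder; every size below is illustrative except imgEmbedSize/imgFeatureSize, which the constructor overrides to 512 and 4096 regardless of what is passed:

# note: the BAN sub-module is moved to GPU inside __init__, so CUDA is required.
encoder = Encoder(
    vocabSize=8000,        # hypothetical vocabulary size
    embedSize=300,         # matches the (300) annotations above
    rnnHiddenSize=512,     # matches the (512) annotation on factRNN
    numLayers=2,           # matches the (2) annotation on factRNN
    useIm='late',          # 'early', 'late', True (treated as 'early'), or False
    imgEmbedSize=512,
    imgFeatureSize=4096,
    numRounds=10,          # hypothetical number of dialog rounds
    isAnswerer=True,       # builds the quesRNN / dialogRNN / BAN fusion path
    dropout=0.5)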