def build_ban(v_dim, num_hid, gamma):
    # w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    # q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    # v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    v_att = BiAttention(v_dim, num_hid, num_hid, gamma)
    b_net = []
    q_prj = []
    c_prj = []
    objects = 10  # minimum number of boxes
    for i in range(gamma):
        b_net.append(BCNet(v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    return BanModel(v_att, b_net, q_prj, c_prj, gamma)
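# Why build_ban allocates gamma parallel (b_net, q_prj, c_prj) triples: in the
# BAN reference implementation the question embedding is refined once per
# attention glimpse with a residual update. A self-contained sketch of that
# loop (hypothetical helper; plain nn.Linear stands in for FCNet/BCNet and all
# shapes are assumptions, not repo values):
def _ban_glimpse_sketch():
    gamma, num_hid, seq_len = 4, 512, 14
    q_prj = [nn.Linear(num_hid, num_hid) for _ in range(gamma)]
    q_emb = torch.randn(8, seq_len, num_hid)          # question features
    for g in range(gamma):
        b_emb = torch.randn(8, num_hid)               # joint feature from glimpse g
        q_emb = q_prj[g](b_emb.unsqueeze(1)) + q_emb  # residual per-glimpse update
    return q_emb.shape                                # torch.Size([8, 14, 512])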
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                        classifier, counter, op, gamma)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
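# Aside (general PyTorch behavior, not specific to this repo): b_net/q_prj/c_prj
# are plain Python lists, so BanModel presumably wraps them in nn.ModuleList;
# parameters held only in a plain list are invisible to .parameters() and thus
# to the optimizer. A self-contained demonstration (hypothetical class, for
# illustration only):
def _module_list_sketch():
    layers = [nn.Linear(4, 4) for _ in range(2)]

    class Holder(nn.Module):
        def __init__(self, layers):
            super(Holder, self).__init__()
            self.plain = layers                    # NOT registered as submodules
            self.wrapped = nn.ModuleList(layers)   # registered; visible to .parameters()

    # Returns 4 (weight + bias per Linear), all contributed by the ModuleList.
    return len(list(Holder(layers).parameters()))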
def __init__(self, v_dim, q_dim, h_dim, h_out, act='ReLU', dropout=[.2, .5], k=3):
    super(BCNet, self).__init__()
    self.c = 32
    self.k = k
    self.v_dim = v_dim
    self.q_dim = q_dim
    self.h_dim = h_dim
    self.h_out = h_out

    self.v_net = FCNet([v_dim, h_dim * self.k], act=act, dropout=dropout[0])
    self.q_net = FCNet([q_dim, h_dim * self.k], act=act, dropout=dropout[0])
    self.dropout = nn.Dropout(dropout[1])  # attention
    if k > 1:
        # pools each group of k fused dimensions back down to h_dim
        self.p_net = nn.AvgPool1d(self.k, stride=self.k)

    if h_out is None:
        pass
    elif h_out <= self.c:
        # few output maps: learn the bilinear weights directly as a parameter
        self.h_mat = nn.Parameter(
            torch.Tensor(1, h_out, 1, h_dim * self.k).normal_())
        self.h_bias = nn.Parameter(torch.Tensor(1, h_out, 1, 1).normal_())
    else:
        # many output maps: fall back to a weight-normalized linear layer
        self.h_net = weight_norm(nn.Linear(h_dim * self.k, h_out), dim=None)
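# What the k-fold projection above buys: BCNet implements low-rank bilinear
# pooling in the style of Bilinear Attention Networks: project v and q into a
# shared (h_dim * k)-dim space, fuse elementwise, then pool each group of k
# back to h_dim (AvgPool1d times k acts as a sum). A self-contained sketch of
# those semantics (hypothetical helper; nn.Linear stands in for FCNet and the
# shapes are assumptions):
def _bilinear_pool_sketch():
    v_dim, q_dim, h_dim, k = 512, 512, 512, 3
    v_net = nn.Linear(v_dim, h_dim * k)
    q_net = nn.Linear(q_dim, h_dim * k)
    p_net = nn.AvgPool1d(k, stride=k)

    v = torch.randn(8, v_dim)                         # visual features
    q = torch.randn(8, q_dim)                         # question features
    joint = v_net(v) * q_net(q)                       # (8, h_dim * k) fusion
    joint = p_net(joint.unsqueeze(1)).squeeze(1) * k  # avg * k == per-group sum
    return joint.shape                                # torch.Size([8, 512])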
def __init__(self, vocabSize, embedSize, rnnHiddenSize, numLayers, useIm,
             imgEmbedSize, imgFeatureSize, numRounds, isAnswerer, dropout=0,
             startToken=None, endToken=None, **kwargs):
    super(Encoder, self).__init__()
    self.vocabSize = vocabSize
    self.embedSize = embedSize
    self.rnnHiddenSize = rnnHiddenSize
    self.numLayers = numLayers
    # assert self.numLayers > 1, "Less than 2 layers not supported!"
    if useIm:
        self.useIm = useIm if useIm != True else 'early'
    else:
        self.useIm = False
    # NOTE: the imgEmbedSize/imgFeatureSize constructor arguments are
    # overridden by these hardcoded values.
    self.imgEmbedSize = 512
    self.imgFeatureSize = 4096
    self.numRounds = numRounds
    self.dropout = dropout
    self.isAnswerer = isAnswerer
    self.startToken = startToken
    self.endToken = endToken

    # modules
    # word embedding: vocabSize tokens -> embedSize dims, with index 0 as padding
    self.wordEmbed = nn.Embedding(
        self.vocabSize, self.embedSize, padding_idx=0)

    # question encoder
    if self.useIm == 'early':
        # image fuses early with words
        quesInputSize = self.embedSize + self.imgEmbedSize
        self.imgNet = nn.Linear(self.imgFeatureSize, self.imgEmbedSize)
        self.imgEmbedDropout = nn.Dropout(0.5)
    elif self.useIm == 'late':
        quesInputSize = self.embedSize  # (300)
        self.imgNet = nn.Linear(self.imgFeatureSize, self.imgEmbedSize)  # (4096 -> 512)
        self.imgEmbedDropout = nn.Dropout(0.5)
    else:
        # no image fusion (both answerer and questioner fall through here)
        quesInputSize = self.embedSize

    if self.isAnswerer:
        self.fc_c = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
        self.fc_h = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
        self.quesRNN = nn.LSTM(
            quesInputSize,
            self.rnnHiddenSize,
            self.numLayers,
            batch_first=True,
            dropout=self.dropout)
        self.Wq_1 = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
        self.Wh_1 = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
        self.fc1 = nn.Linear(2 * self.rnnHiddenSize, self.rnnHiddenSize)
        self.dialogRNN = nn.LSTM(
            self.rnnHiddenSize,
            self.rnnHiddenSize,
            self.numLayers,
            batch_first=False,
            dropout=self.dropout)
        self.q_net = FCNet([self.rnnHiddenSize, self.rnnHiddenSize])
        self.v_net = FCNet([self.rnnHiddenSize, self.rnnHiddenSize])
    else:
        self.ansRNN = nn.LSTM(
            quesInputSize,
            self.rnnHiddenSize,
            self.numLayers,
            batch_first=True,
            dropout=dropout)
        self.Wq_1 = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
        self.Wh_1 = nn.Linear(self.rnnHiddenSize, self.rnnHiddenSize)
        self.fc1 = nn.Linear(3 * self.rnnHiddenSize, self.rnnHiddenSize)
        self.dialogRNN = nn.LSTM(
            self.rnnHiddenSize,
            self.rnnHiddenSize,
            self.numLayers,
            batch_first=False,
            dropout=self.dropout)

    # history encoder
    self.factRNN = nn.LSTM(
        self.embedSize,      # (300)
        self.rnnHiddenSize,  # (512)
        self.numLayers,      # (2)
        batch_first=True,
        dropout=dropout)

    # memory network (MemN2N) settings
    settings = {
        # "use_cuda": True,
        "num_vocab": self.vocabSize,
        "embedding_dim": self.embedSize,
        "sentence_size": 32,
        "max_hops": 1,
        "dropout": self.dropout
    }
    self.men_n2n = MemN2N(settings)
    self.ban = base_model.build_ban(v_dim=512, num_hid=512, gamma=4).cuda()
    self.ATTMODULE = ATT_MODULE()
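# Sketch of the 'early' fusion path configured above: the embedded image is
# concatenated to every word embedding before quesRNN, which is why that branch
# sets quesInputSize = embedSize + imgEmbedSize. Self-contained illustration
# (hypothetical helper; batch and sequence sizes are assumptions):
def _early_fusion_sketch():
    embedSize, imgEmbedSize, rnnHiddenSize = 300, 512, 512
    imgNet = nn.Linear(4096, imgEmbedSize)
    quesRNN = nn.LSTM(embedSize + imgEmbedSize, rnnHiddenSize,
                      2, batch_first=True)

    wordEmb = torch.randn(8, 20, embedSize)           # (batch, seq, embedSize)
    imgEmb = imgNet(torch.randn(8, 4096))             # (batch, imgEmbedSize)
    imgEmb = imgEmb.unsqueeze(1).expand(-1, 20, -1)   # tile along the sequence
    out, _ = quesRNN(torch.cat([wordEmb, imgEmb], dim=-1))
    return out.shape                                  # torch.Size([8, 20, 512])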