def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    v_att = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    q_att = weight_norm(nn.Linear(num_hid, 1), dim=None)
    w_emb2 = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb2 = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    v_att2 = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    v_net2 = FCNet([dataset.v_dim, num_hid])
    q_att2 = weight_norm(nn.Linear(num_hid, 1), dim=None)
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2, 3, .5)
        classifier2 = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, w_emb, q_emb, v_att, q_att, b_net, q_prj, c_prj, q_net, v_net,
                        classifier, classifier2, counter, op, gamma,
                        w_emb2, q_emb2, v_att2, v_net2, q_att2)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    cls_att = NewAttention(dataset.cls_dim, q_emb.num_hid, num_hid)
    attr_att = NewAttention(dataset.attr_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    cls_net = FCNet([dataset.cls_dim, num_hid])
    attr_net = FCNet([dataset.attr_dim, num_hid])
    fusion_dim = 16000
    mcb = CompactBilinearPooling(num_hid, num_hid, fusion_dim)
    classifier = SimpleClassifier(fusion_dim, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, cls_att, attr_att, q_net, v_net, cls_net, attr_net,
                     classifier, mcb)
def __init__(self, vocab_size, embed_hidden=300, mlp_hidden=512):
    super(TopDown, self).__init__()
    self.vocab_size = vocab_size
    self.q_emb = nn.LSTM(embed_hidden, mlp_hidden, batch_first=True, bidirectional=True)
    self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
    self.v_att = NewAttention(mlp_hidden, mlp_hidden, mlp_hidden)
    '''self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(
        mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)'''
    self.classifier = nn.Sequential(
        nn.Linear(mlp_hidden * 7 * 7 + mlp_hidden, mlp_hidden * 8),
        nn.BatchNorm1d(mlp_hidden * 8),
        nn.ReLU(inplace=True),
        nn.Dropout(0.5),
        nn.Linear(mlp_hidden * 8, mlp_hidden * 8),
        nn.BatchNorm1d(mlp_hidden * 8),
        nn.ReLU(inplace=True),
        nn.Dropout(0.5),
    )
def __init__(self, image_emb_size, qst_emb_size, no_ans):
    super(VQA_Model, self).__init__()
    emb_size = image_emb_size + qst_emb_size
    self.img_att = NewAttention(image_emb_size, qst_emb_size, qst_emb_size)
    # fc1_size = 1024
    # emb_size = fc1_size + qst_desc_emb_size
    # self.linear1 = nn.Linear(img_ques_emb_size, fc1_size)
    self.linear = nn.Linear(emb_size, no_ans)
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    if not dataset.bert:
        q_att = SelfAttention(q_emb.num_hid, num_hid)
        v_att = NewAttention(dataset.v_dim + 2, q_emb.num_hid, num_hid)
        q_net = FCNet([q_emb.num_hid, num_hid])
    else:
        q_att = SelfAttention(768, num_hid)
        q_emb = FCNet([768, 768])
        v_att = NewAttention(dataset.v_dim, 768, num_hid)
        q_net = FCNet([768, num_hid])
    v_net = FCNet([dataset.v_dim + 2, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, q_att, v_att, q_net, v_net, classifier, dataset.bert)
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
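# Usage sketch (an assumption for illustration, not taken from any of the snippets in this
# collection): each builder expects a dataset object exposing a token dictionary
# (dictionary.ntoken or question_dictionary.ntoken), v_dim, and num_ans_candidates.
# The Dictionary and VQAFeatureDataset names below follow the bottom-up-attention-vqa
# convention and are placeholders here; the forward signature of the returned model depends
# on the concrete BaseModel in each repository.
dictionary = Dictionary.load_from_file('data/dictionary.pkl')
train_dset = VQAFeatureDataset('train', dictionary)
model = build_baseline0_newatt(train_dset, num_hid=1024)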
def __init__(self, image_emb_size, qst_emb_size, no_ans):
    super(VQA_Model, self).__init__()
    num_hid = 1024
    # emb_size = image_emb_size + qst_emb_size
    self.img_att = NewAttention(image_emb_size, qst_emb_size, qst_emb_size)
    # self.linear = nn.Linear(emb_size, no_ans)
    self.q_net = FCNet([image_emb_size, num_hid])
    self.v_net = FCNet([qst_emb_size, num_hid])
    self.classifier = SimpleClassifier(num_hid, num_hid * 2, no_ans, 0.5)
def visualize_vqe(dataset, num_hid, att_dim, dec_dim):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    generator = STDecoder(
        dataset.v_dim, num_hid, 300, dec_dim,
        dataset.explanation_dictionary.ntoken, 1, 0.5)
    return VQE(w_emb, q_emb, v_att, q_net, v_net, generator)
def __init__(self, vocab_size, embed_hidden=300, mlp_hidden=512):
    super(TopDown, self).__init__()
    self.vocab_size = vocab_size
    self.q_emb = nn.LSTM(embed_hidden, mlp_hidden, batch_first=True, bidirectional=True)
    self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.v_att = NewAttention(mlp_hidden, mlp_hidden, mlp_hidden)
def build_LL_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier_LL = SimpleClassifier(
        num_hid * 4, num_hid // 8, 1, 0.5)  # integer division keeps the hidden size an int
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return LL4ALModel(w_emb, q_emb, v_att, q_net, v_net, classifier_LL, classifier_All)
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    c_1 = MLP(input_dim=1024, dimensions=[1024, 1024, dataset.num_ans_candidates])
    c_2 = nn.Linear(dataset.num_ans_candidates, dataset.num_ans_candidates)
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, c_1, c_2)
def build_vqae3_split(dataset, num_hid, att_dim, dec_dim):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att_1 = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net_1 = FCNet([q_emb.num_hid, num_hid])
    v_net_1 = FCNet([dataset.v_dim, num_hid])
    v_att_2 = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net_2 = FCNet([q_emb.num_hid, num_hid])
    v_net_2 = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    generator = STDecoder(
        dataset.v_dim, num_hid, 300, dec_dim,
        dataset.explanation_dictionary.ntoken, 1, 0.5)
    e_emb = ExplainEmbedding(generator.embed, 300, num_hid, 1, False, 0.0, 'GRU')
    T_vq = FCNet([num_hid, num_hid])
    T_e = FCNet([e_emb.num_hid, num_hid])
    return Split_VQAE(w_emb, q_emb, v_att_1, q_net_1, v_net_1, v_att_2, q_net_2, v_net_2,
                      classifier, generator, e_emb, T_vq, T_e)
def build_vqae_newatt(dataset, num_hid, att_dim, dec_dim):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    generator = STDecoder(
        dataset.v_dim, num_hid, 300, dec_dim,
        dataset.explanation_dictionary.ntoken, 1, 0.5)
    return VQAE(w_emb, q_emb, v_att, q_net, v_net, classifier, generator)
def build_multimodal_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier_V = SimpleClassifier(
        2048, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_Q = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return MultiModalModel(w_emb, q_emb, v_att, q_net, v_net,
                           classifier_V, classifier_Q, classifier_All)
def build_stackatt(dataset, num_hid, args):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, args.rnn_layer, False, 0.4)
    v_att = NewAttention(dataset.v_dim, 2048 + q_emb.num_hid, num_hid, 0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    query_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    model = BaseModelStackAtt(w_emb, q_emb, v_att, q_net, v_net, query_net, classifier, args)
    return model
def build_CCB_model(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_ct_net = FCNet([q_emb.num_hid, num_hid])
    q_cx_net = FCNet([q_emb.num_hid, num_hid])
    v_ct_net = FCNet([dataset.v_dim, num_hid])
    v_cx_net = FCNet([dataset.v_dim, num_hid])
    classifier_fq = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_vq = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return CCB_Model(w_emb, q_emb, v_att, q_ct_net, q_cx_net, v_ct_net,
                     classifier_fq, classifier_vq, v_cx_net)
def build_lstm_vqa(dataset, num_hid, att_dim, dec_dim):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    generator = SATDecoder(
        dataset.v_dim, num_hid, 300, att_dim, dec_dim,
        dataset.explanation_dictionary.ntoken, 1, 0.5)
    # att_emb = nn.GRU(dataset.v_dim, num_hid, 1, False, batch_first=True)
    att_emb = nn.GRUCell(dataset.v_dim, num_hid)
    classifier = SimpleClassifier(num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return LSTM_VQA(w_emb, q_emb, v_att, q_net, v_net, generator, att_emb, classifier)
def build_vqae2_newatt(dataset, num_hid, emb_rnn='GRU'):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, 1024, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, 1024, 1024)
    q_net = FCNet([1024, 1024])
    v_net = FCNet([dataset.v_dim, 1024])
    classifier = SimpleClassifier(1024, 1024 * 2, dataset.num_ans_candidates, 0.5)
    generator = STDecoder(
        dataset.v_dim, 1024, 300, 1024,
        dataset.explanation_dictionary.ntoken, 1, 0.5)
    e_emb = ExplainEmbedding(generator.embed, 300, num_hid, 1, False, 0.0, emb_rnn)
    e_net = FCNet([e_emb.num_hid, 1024])
    return VQAE2(w_emb, q_emb, v_att, q_net, v_net, classifier, generator, e_emb, e_net)
def build_baseline0_newatt(dataset, num_hid, reconstruction, size=64, dropout_hid=0.0,
                           gamma_r=0.0, adv_mode="wgan", logger=None):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, num_hid, dataset.v_dim,
                     reconstruction, size, dropout_hid, gamma_r, adv_mode, logger)
def __init__(self, vocab_size, embed_hidden=300, mlp_hidden=512):
    super(TopDown, self).__init__()
    self.vocab_size = vocab_size
    self.v_att = NewAttention(mlp_hidden, 768, mlp_hidden)
    '''self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(
        mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)'''
    self.classifier = nn.Sequential(
        nn.Linear(mlp_hidden * 7 * 7, mlp_hidden * 8),
        nn.BatchNorm1d(mlp_hidden * 8),
        nn.ReLU(inplace=True),
        nn.Dropout(0.5),
        nn.Linear(mlp_hidden * 8, mlp_hidden * 8),
        nn.BatchNorm1d(mlp_hidden * 8),
        nn.ReLU(inplace=True),
        nn.Dropout(0.5),
    )
def build_baseline0_newatt(dataset, num_hid, reconstruction, layer=4, size=64, variant='',
                           finetune=False, use_residual=False, use_feat_loss=False,
                           dropout_hid=False, dropout_unet=False, logger=None):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, num_hid, dataset.v_dim,
                     reconstruction, layer, size, variant, finetune, use_residual, use_feat_loss,
                     dropout_hid, dropout_unet, logger)
def build_baseline0_newatt2(args, num_hid):
    w_emb = WordEmbedding(args.vocab_size, args.ninp, 0.0)
    q_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)
    h_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)
    v_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    h_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    qih_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    qhi_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    q_net = FCNet([q_emb.num_hid * 2, num_hid * 2])
    v_net = FCNet([args.nhid * 2, num_hid * 2])
    h_net = FCNet([args.nhid * 2, num_hid * 2])
    qih_net = FCNet([args.nhid * 2, num_hid * 2])
    qhi_net = FCNet([args.nhid * 2, num_hid * 2])
    qhih_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    qihi_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    decoder = netG(args)
    return BaseModel2(w_emb, q_emb, h_emb, v_att, h_att, q_net, v_net, h_net, qih_att, qhi_att,
                      qih_net, qhi_net, decoder, args, qhih_att, qihi_att)
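# Every snippet above instantiates NewAttention(v_dim, q_dim, num_hid[, dropout]). The class
# below is a minimal self-contained sketch of that top-down attention pattern, written for
# illustration only: the repos' own implementation uses FCNet projections and weight_norm,
# so this is an approximation of the idea, not their exact code.
import torch.nn as nn
import torch.nn.functional as F


class NewAttentionSketch(nn.Module):
    def __init__(self, v_dim, q_dim, num_hid, dropout=0.2):
        super().__init__()
        self.v_proj = nn.Sequential(nn.Linear(v_dim, num_hid), nn.ReLU())
        self.q_proj = nn.Sequential(nn.Linear(q_dim, num_hid), nn.ReLU())
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(num_hid, 1)

    def forward(self, v, q):
        # v: [batch, num_objs, v_dim] region features; q: [batch, q_dim] question embedding
        num_objs = v.size(1)
        q = q.unsqueeze(1).expand(-1, num_objs, -1)      # broadcast the question to every region
        joint = self.v_proj(v) * self.q_proj(q)          # fuse by elementwise product
        logits = self.linear(self.dropout(joint))        # one scalar score per region
        return F.softmax(logits, dim=1)                  # attention weights over regions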