def build_stackatt(dataset, num_hid, args):
    """Assemble the stacked-attention VQA model from its sub-modules."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    ques_emb = QuestionEmbedding(300, num_hid, args.rnn_layer, False, 0.4)
    # attention conditions on image features concatenated with a 2048-d context
    attention = NewAttention(dataset.v_dim, 2048 + ques_emb.num_hid, num_hid, 0.2)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    query_proj = FCNet([dataset.v_dim, num_hid])
    clf = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModelStackAtt(word_emb, ques_emb, attention, ques_proj, vis_proj,
                             query_proj, clf, args)
def build_baseline0_newatt(dataset, num_hid):
    """Bottom-up/top-down baseline with new attention and an extra u-branch."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    attention = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    # projections bringing question / visual / u features to a common width
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    u_proj = FCNet([dataset.v_dim, num_hid])
    clf = SimpleClassifier(num_hid * 2, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(word_emb, ques_emb, attention, ques_proj, vis_proj, u_proj, clf)
def build_CCB_model(dataset, num_hid):
    """Build the CCB model: separate content ("ct") and context ("cx") branches."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    attention = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    # per-modality content/context projections
    q_content = FCNet([ques_emb.num_hid, num_hid])
    q_context = FCNet([ques_emb.num_hid, num_hid])
    v_content = FCNet([dataset.v_dim, num_hid])
    v_context = FCNet([dataset.v_dim, num_hid])
    # two heads: fused-question (fq) and visual-question (vq)
    clf_fq = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    clf_vq = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return CCB_Model(word_emb, ques_emb, attention, q_content, q_context,
                     v_content, clf_fq, clf_vq, v_context)
def build_caq_newatt(dataset, num_hid):
    """Build the context-aware-question (CAQ) model with neighbour attention."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    attention = Attention(dataset.v_dim, ques_emb.num_hid, num_hid)
    half_hid = num_hid // 2
    ques_proj = FCNet([ques_emb.num_hid, half_hid])
    vis_proj = FCNet([dataset.v_dim, half_hid])
    query_composer = FCNet([num_hid + half_hid, num_hid])
    neighbour_att = MultiHeadedAttention(4, half_hid, dropout=0.1)
    drop_c = nn.Dropout(0.1)
    # one extra output slot beyond the answer-candidate set
    clf = SimpleClassifier(half_hid, num_hid * 2, dataset.num_ans_candidates + 1, 0.5)
    return CAQModel(word_emb, ques_emb, attention, ques_proj, vis_proj,
                    query_composer, neighbour_att, drop_c, clf, dataset)
def build_baseline(dataset, opt):
    """Assemble the video-QA baseline (answer-ranking head plus question attention)."""
    word_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    ques_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER,
                                 opt.BIDIRECT, opt.L_RNN_DROPOUT)
    # video stream mixes C3D motion and ResNet appearance features
    video_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                               opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    video_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    rela_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    video_fc = Videofc(opt.GLIMPSE, opt.C3D_SIZE + opt.RES_SIZE,
                       opt.NUM_HIDDEN, opt.FC_DROPOUT)
    answer_emb = AnswerEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER,
                                 opt.BIDIRECT, opt.L_RNN_DROPOUT)
    rela_emb = Rela_Module(opt.NUM_HIDDEN * 3, opt.NUM_HIDDEN, opt.NUM_HIDDEN)
    # single-logit head: the model scores each candidate answer
    clf = SimpleClassifier(opt.NUM_HIDDEN * 2, opt.MID_DIM, 1, opt.FC_DROPOUT)
    ques_att = Q_Att(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    return BaseModel(word_emb, ques_emb, video_emb, answer_emb, video_att,
                     video_fc, rela_emb, rela_att, clf, ques_att, opt)
def build_baseline(dataset, opt=None):
    """Assemble the video-QA baseline (variant without classifier/question-attention).

    Bug fix: the original body began with `opt = config.parse_opt()`, which
    unconditionally re-parsed the config and silently discarded the `opt`
    object the caller passed in.  The caller's options are now honoured;
    `config.parse_opt()` is only used as a fallback when `opt` is None.

    NOTE(review): this module defines another `build_baseline` earlier with a
    different BaseModel signature; the later definition shadows the earlier
    one at import time — confirm which one callers expect.
    """
    if opt is None:
        opt = config.parse_opt()
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER,
                              opt.BIDIRECT, opt.L_RNN_DROPOUT)
    # video stream mixes C3D motion and ResNet appearance features
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                           opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    r_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    v_fc = Videofc(opt.GLIMPSE, opt.C3D_SIZE + opt.RES_SIZE,
                   opt.NUM_HIDDEN, opt.FC_DROPOUT)
    a_emb = AnswerEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER,
                            opt.BIDIRECT, opt.L_RNN_DROPOUT)
    rela_emb = Rela_Module(opt.NUM_HIDDEN * 3, opt.NUM_HIDDEN, opt.NUM_HIDDEN)
    return BaseModel(w_emb, q_emb, v_emb, a_emb, v_att, v_fc, rela_emb, r_att, opt)
def build_baseline0_newatt(dataset, num_hid):
    """Baseline with extra class/attribute streams fused by compact bilinear pooling."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    # one attention head per visual stream: regions, classes, attributes
    region_att = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    class_att = NewAttention(dataset.cls_dim, ques_emb.num_hid, num_hid)
    attrib_att = NewAttention(dataset.attr_dim, ques_emb.num_hid, num_hid)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    region_proj = FCNet([dataset.v_dim, num_hid])
    class_proj = FCNet([dataset.cls_dim, num_hid])
    attrib_proj = FCNet([dataset.attr_dim, num_hid])
    fusion_dim = 16000
    mcb = CompactBilinearPooling(num_hid, num_hid, fusion_dim)
    clf = SimpleClassifier(fusion_dim, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(word_emb, ques_emb, region_att, class_att, attrib_att,
                     ques_proj, region_proj, class_proj, attrib_proj, clf, mcb)
def build_model_A2x3(dataset, num_hid, dropout, norm, activation, dropL, dropG, dropW, dropC):
    """Model A2x3: three parallel Att_2 attention maps over a shared GRU encoding."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, emb_dim=300, dropout=dropW)
    ques_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                                 bidirect=False, dropout=dropG, rnn_type='GRU')
    # three identically-configured attention heads, built in order
    glimpses = []
    for _ in range(3):
        glimpses.append(Att_2(v_dim=dataset.v_dim, q_dim=ques_emb.num_hid,
                              num_hid=num_hid, dropout=dropout,
                              norm=norm, act=activation))
    ques_proj = FCNet([ques_emb.num_hid, num_hid], dropout=dropL, norm=norm, act=activation)
    vis_proj = FCNet([dataset.v_dim, num_hid], dropout=dropL, norm=norm, act=activation)
    clf = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid,
                           out_dim=dataset.num_ans_candidates,
                           dropout=dropC, norm=norm, act=activation)
    return Model_3(word_emb, ques_emb, glimpses[0], glimpses[1], glimpses[2],
                   ques_proj, vis_proj, clf)
def build_baseline0_newatt(dataset, num_hid):
    """Baseline with question self-attention; uses 768-d BERT question features
    when dataset.bert is set (ques_emb is then repurposed as a projection)."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    if dataset.bert:
        ques_att = SelfAttention(768, num_hid)
        ques_emb = FCNet([768, 768])
        vis_att = NewAttention(dataset.v_dim, 768, num_hid)
        ques_proj = FCNet([768, num_hid])
    else:
        ques_att = SelfAttention(ques_emb.num_hid, num_hid)
        vis_att = NewAttention(dataset.v_dim + 2, ques_emb.num_hid, num_hid)
        ques_proj = FCNet([ques_emb.num_hid, num_hid])
    # NOTE(review): vis_proj always uses v_dim + 2 while the BERT-path attention
    # uses plain v_dim — possibly intentional, verify against BaseModel.forward.
    vis_proj = FCNet([dataset.v_dim + 2, num_hid])
    clf = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(word_emb, ques_emb, ques_att, vis_att, ques_proj, vis_proj,
                     clf, dataset.bert)
def build_ban_foil(dataset, num_hid, num_ans_candidates, op='', gamma=4):
    """BAN for the FOIL task: gamma bilinear-attention glimpses plus a box counter."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    # 'c' in op concatenates an extra 300-d word stream, doubling the input width
    ques_emb = QuestionEmbedding(600 if 'c' in op else 300, num_hid, 1, False, .0)
    bi_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    b_net, q_prj, c_prj = [], [], []
    objects = 10  # minimum number of boxes
    for _ in range(gamma):
        b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    clf = SimpleClassifierFoil(num_hid, 64, num_ans_candidates)
    counter = Counter(objects)
    return BanModel(dataset, word_emb, ques_emb, bi_att, b_net, q_prj, c_prj,
                    clf, counter, op, gamma)
def __init__(self, opt):
    """Wire up the UpDn (bottom-up/top-down) VQA network from parsed options."""
    super(UpDn, self).__init__()
    self.opt = opt
    print(f"ntokens {opt.ntokens}")
    # word embedding, initialised from pre-trained GloVe vectors
    self.w_emb = WordEmbedding(opt.ntokens, emb_dim=300, dropout=opt.dropW)
    self.w_emb.init_embedding(f'{opt.data_dir}/glove6b_init_300d.npy')
    self.q_emb = QuestionEmbedding(in_dim=300, num_hid=opt.num_hid)
    self.q_net = FCNet([self.q_emb.num_hid, opt.num_hid],
                       dropout=opt.dropL, norm=opt.norm, act=opt.activation)
    self.gv_net = FCNet([2048, opt.num_hid],
                        dropout=opt.dropL, norm=opt.norm, act=opt.activation)
    # two attention glimpses over the 2048-d visual features
    self.gv_att_1 = Att_3(v_dim=2048, q_dim=self.q_emb.num_hid, num_hid=opt.num_hid,
                          dropout=opt.dropout, norm=opt.norm, act=opt.activation)
    self.gv_att_2 = Att_3(v_dim=2048, q_dim=self.q_emb.num_hid, num_hid=opt.num_hid,
                          dropout=opt.dropout, norm=opt.norm, act=opt.activation)
    # 3129-way head — presumably the standard VQA v2 answer vocabulary; confirm
    self.classifier = SimpleClassifier(in_dim=opt.num_hid, hid_dim=2 * opt.num_hid,
                                       out_dim=3129, dropout=opt.dropC,
                                       norm=opt.norm, act=opt.activation)
def build_baseline0_newatt(dataset, num_hid, reconstruction, size=64, dropout_hid=0.0,
                           gamma_r=0.0, adv_mode="wgan", logger=None):
    """Baseline new-attention model extended with reconstruction/adversarial options."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    attention = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    clf = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(word_emb, ques_emb, attention, ques_proj, vis_proj, clf,
                     num_hid, dataset.v_dim, reconstruction, size, dropout_hid,
                     gamma_r, adv_mode, logger)
def build_model_A3x2_h(dataset, num_hid, dropout, norm, activation, dropL, dropG, dropW, dropC):
    """Model A3x2_h: two Att_3 attention glimpses plus an HNet hint branch."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, emb_dim=300, dropout=dropW)
    ques_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                                 bidirect=False, dropout=dropG, rnn_type='GRU')
    print('v_dim: %d\tq_dim: %d\tnum_hid: %d\t num ans candidates: %d' %
          (dataset.v_dim, ques_emb.num_hid, num_hid, dataset.num_ans_candidates))
    glimpse_1 = Att_3(v_dim=dataset.v_dim, q_dim=ques_emb.num_hid, num_hid=num_hid,
                      dropout=dropout, norm=norm, act=activation)
    glimpse_2 = Att_3(v_dim=dataset.v_dim, q_dim=ques_emb.num_hid, num_hid=num_hid,
                      dropout=dropout, norm=norm, act=activation)
    ques_proj = FCNet([ques_emb.num_hid, num_hid], dropout=dropL, norm=norm, act=activation)
    vis_proj = FCNet([dataset.v_dim, num_hid], dropout=dropL, norm=norm, act=activation)
    hint_net = HNet([1280, 100, 100], [1280, 1280])
    clf = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid,
                           out_dim=dataset.num_ans_candidates,
                           dropout=dropC, norm=norm, act=activation)
    return Model_h(word_emb, ques_emb, glimpse_1, glimpse_2, ques_proj, vis_proj,
                   hint_net, clf)
def build_SAN(dataset, args):
    """Build a Stacked Attention Network, optionally with MAML and/or DAE visual encoders."""
    # word embedding, question embedding, and stacked attention
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, args.op)
    ques_emb = QuestionEmbedding(600 if "c" in args.op else 300, args.num_hid,
                                 1, False, 0.0, args.rnn)
    stacked_att = StackedAttention(args.num_stacks, dataset.v_dim, args.num_hid,
                                   args.num_hid, dataset.num_ans_candidates,
                                   args.dropout)
    # optional pre-trained MAML visual encoder
    maml_v_emb = None
    if args.maml:
        weight_path = args.RAD_dir + "/" + args.maml_model_path
        print("load initial weights MAML from: %s" % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # optional pre-trained denoising auto-encoder visual encoder
    ae_v_emb = None
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + "/" + args.ae_model_path
        print("load initial weights DAE from: %s" % (weight_path))
        ae_v_emb.load_state_dict(torch.load(weight_path, args.map_location))
    # optionally re-weight the word embedding with TF-IDF
    if hasattr(args, "tfidf"):
        word_emb = tfidf_loading(args.tfidf, word_emb, args)
    clf = SimpleClassifier(args.num_hid, 2 * args.num_hid,
                           dataset.num_ans_candidates, args)
    # the original four maml/autoencoder return branches collapse to one call:
    # an absent encoder is simply passed as None
    return SAN_Model(word_emb, ques_emb, stacked_att, clf, args, maml_v_emb, ae_v_emb)
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    """Build a BAN model for 'vqa' (with box counter) or 'flickr' grounding.

    Returns None for any other `task`, matching the original behaviour.
    """
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    ques_emb = QuestionEmbedding(600 if 'c' in op else 300, num_hid, 1, False, .0)
    attention = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    ques_proj = FCNet([ques_emb.num_hid, num_hid], 'Sigmoid')
    vis_proj = FCNet([dataset.v_dim, num_hid])
    if task == 'vqa':
        b_net, q_prj, c_prj = [], [], []
        objects = 10  # minimum number of boxes
        for _ in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        clf = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, word_emb, ques_emb, attention, b_net, q_prj,
                        c_prj, ques_proj, vis_proj, clf, counter, op, gamma)
    if task == 'flickr':
        return BanModel_flickr(word_emb, ques_emb, attention, op, gamma)
def build_fine(dataset, num_hid, args):
    """Build the fine-tuning model: ResNet backbone (only layer4 trainable) + dual attention."""
    backbone = getattr(resnet, args.cnn_model)()
    backbone.load_state_dict(
        torch.load(os.path.join(args.model_root, args.cnn_model + '.pth')))
    cnn = myResnet(backbone)
    # freeze the whole CNN, then unfreeze just the last residual stage
    for param in cnn.parameters():
        param.requires_grad = False
    for param in cnn.resnet.layer4.parameters():
        param.requires_grad = True
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.4)
    attention = DualAttention(dataset.v_dim, ques_emb.num_hid, num_hid, 0.2)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    clf = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModelWithCNN(word_emb, ques_emb, attention, ques_proj, vis_proj,
                            clf, cnn, args)
def build_model_P_mod(dataset, num_hid, dropout, norm, activation, dropL, dropG, dropW, dropC):
    """Model P (paper variant): Att_P attention, gated projections, PaperClassifier."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, emb_dim=300, dropout=dropW)
    ques_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                                 bidirect=False, dropout=dropG, rnn_type='GRU')
    attention = Att_P(v_dim=dataset.v_dim, q_dim=ques_emb.num_hid, num_hid=num_hid,
                      dropout=dropout, norm=norm, act=activation)
    # gated (GTH) projections instead of plain FC nets
    ques_proj = GTH(ques_emb.num_hid, num_hid, dropout=dropL, norm=norm, act=activation)
    vis_proj = GTH(dataset.v_dim, num_hid, dropout=dropL, norm=norm, act=activation)
    clf = PaperClassifier(in_dim=num_hid, hid_dim_1=300, hid_dim_2=2048,
                          out_dim=dataset.num_ans_candidates,
                          dropout=dropC, norm=norm, act=activation)
    return Model(word_emb, ques_emb, attention, ques_proj, vis_proj, clf)
def build_baseline0_newatt(dataset, num_hid, reconstruction, layer=4, size=64, variant='',
                           finetune=False, use_residual=False, use_feat_loss=False,
                           dropout_hid=False, dropout_unet=False, logger=None):
    """Baseline new-attention model wired with U-Net reconstruction options."""
    word_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    ques_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    attention = NewAttention(dataset.v_dim, ques_emb.num_hid, num_hid)
    ques_proj = FCNet([ques_emb.num_hid, num_hid])
    vis_proj = FCNet([dataset.v_dim, num_hid])
    clf = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(word_emb, ques_emb, attention, ques_proj, vis_proj, clf,
                     num_hid, dataset.v_dim, reconstruction, layer, size, variant,
                     finetune, use_residual, use_feat_loss, dropout_hid,
                     dropout_unet, logger)
class typeAttention(nn.Module):
    """Question-type attention head.

    Encodes the question with a GRU, pools it with QuestionAttention, and maps
    the pooled 1024-d vector through a 1024 -> 2048 -> 1024 -> 1024 MLP.
    """

    def __init__(self, size_question, path_init):
        super(typeAttention, self).__init__()
        self.w_emb = WordEmbedding(size_question, 300, 0.0, False)
        self.w_emb.init_embedding(path_init)  # pre-trained GloVe vectors
        self.q_emb = QuestionEmbedding(300, 1024, 1, False, 0.0, 'GRU')
        self.q_final = QuestionAttention(1024)
        self.f_fc1 = linear(1024, 2048)
        self.f_fc2 = linear(2048, 1024)
        self.f_fc3 = linear(1024, 1024)

    def forward(self, question):
        """Return a [batch, 1024] type-attention vector for `question` token ids."""
        w_emb = self.w_emb(question)
        q_emb = self.q_emb.forward_all(w_emb)  # [batch, q_len, q_dim]
        q_final = self.q_final(w_emb, q_emb)  # b, 1024
        x_f = self.f_fc1(q_final)
        x_f = F.relu(x_f)
        x_f = self.f_fc2(x_f)
        # Bug fix: F.dropout defaults to training=True, so the original applied
        # dropout even in eval mode; gate it on the module's training flag.
        x_f = F.dropout(x_f, training=self.training)
        x_f = F.relu(x_f)
        x_f = self.f_fc3(x_f)
        return x_f
def build_BAN(dataset, args, priotize_using_counter=None):
    """Build a Bilinear Attention Network, optionally with MAML/DAE visual encoders.

    Bug fix: `priotize_using_counter` previously defaulted to False, which made
    the `priotize_using_counter is None` check unreachable — `args.use_counter`
    could never enable the counter.  The default is now None, meaning "defer to
    args.use_counter"; pass True/False explicitly to override.
    """
    # word embedding, question embedding, and bilinear attention
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, args.op)
    q_emb = QuestionEmbedding(600 if "c" in args.op else 300, args.num_hid,
                              1, False, 0.0, args.rnn)
    v_att = BiAttention(dataset.v_dim, args.num_hid, args.num_hid, args.gamma)
    # optional pre-trained MAML visual encoder
    maml_v_emb = None
    if args.maml:
        weight_path = args.RAD_dir + "/" + args.maml_model_path
        print("load initial weights MAML from: %s" % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # optional pre-trained denoising auto-encoder visual encoder
    ae_v_emb = None
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + "/" + args.ae_model_path
        print("load initial weights DAE from: %s" % (weight_path))
        ae_v_emb.load_state_dict(torch.load(weight_path, args.map_location))
    # optionally re-weight the word embedding with TF-IDF
    if hasattr(args, "tfidf"):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # optional counting module
    use_counter = (args.use_counter if priotize_using_counter is None
                   else priotize_using_counter)
    counter = None
    if use_counter:
        objects = 10  # minimum number of boxes
        counter = Counter(objects)
    # BAN residual refinement stages, one per glimpse
    b_net = []
    q_prj = []
    c_prj = []
    for _ in range(args.gamma):
        b_net.append(BCNet(dataset.v_dim, args.num_hid, args.num_hid, None, k=1))
        q_prj.append(FCNet([args.num_hid, args.num_hid], "", 0.2))
        if use_counter:
            c_prj.append(FCNet([objects + 1, args.num_hid], "ReLU", 0.0))
    classifier = SimpleClassifier(args.num_hid, args.num_hid * 2,
                                  dataset.num_ans_candidates, args)
    # absent encoders are passed as None (replaces the original 4-way branch)
    return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                     classifier, counter, args, maml_v_emb, ae_v_emb)
utils.assert_eq(len(tokens), max_length) ans_tokens.append(tokens) return ans_tokens def create_answer_embedding(ans_list, dictionary, w_emb, ans_emb): ans_tokens = tokenize(ans_list, dictionary) ans_tokens = torch.from_numpy(np.array(ans_tokens)) answer_embedding = torch.zeros(3129, 1024) for idx, ans in enumerate(ans_tokens): ans = ans.unsqueeze(0) w = w_emb(ans) ans = ans_emb(w) answer_embedding[idx] = ans.squeeze() with open('data/answer_embedding.pkl', 'wb') as f: cPickle.dump(answer_embedding, f) if __name__ == '__main__': dictionary = Dictionary.load_from_file('data/dictionary.pkl') w_emb = WordEmbedding(dictionary.ntoken, 300, .0, 'c') w_emb.init_embedding('data/glove6b_init_300d.npy', None, None) ans_emb = QuestionEmbedding(600, 1024, 1, False, .0) ans2label_path = ans2label_path = os.path.join('data', 'cache', 'trainval_ans2label.pkl') ans2label = cPickle.load(open(ans2label_path, 'rb')) ans_list = [ans for ans in ans2label] create_answer_embedding(ans_list, dictionary, w_emb, ans_emb)
    # NOTE(review): stray tail of a loss function defined above this view.
    return loss


def compute_score_with_logits(logits, labels):
    # VQA soft accuracy: one-hot the argmax prediction and read off the
    # (possibly fractional) ground-truth score at that index.
    logits = torch.max(logits, 1)[1].data  # argmax
    one_hots = torch.zeros(*labels.size()).cuda()
    one_hots.scatter_(1, logits.view(-1, 1), 1)
    scores = (one_hots * labels)
    return scores


# NOTE(review): everything below appears to be pasted notebook/REPL residue
# (bare expressions whose values were being inspected interactively). The
# statements execute on import and reference names (pred2, a2, q, dictionary)
# not defined here — consider deleting this section.
pred2.shape
tmp = instance_bce_with_logits(pred2.to('cpu'), a2.to('cpu'))
tmp.size()
tmp.shape
input = torch.randn(3, requires_grad=True)
target = torch.empty(3).random_(2)
loss = torch.nn.functional.binary_cross_entropy_with_logits(input, target)
loss.backward()
len(tmp[0])
tmp
from language_model import QuestionEmbedding, WordEmbedding, QuestionEmbedding2
num_hid = 1024
w_emb = WordEmbedding(dictionary.ntoken, 300, 0.0)
q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
q_emb2 = QuestionEmbedding2(300, num_hid, 1, True, 0.0)
w_emb = w_emb(q)
q_emb = q_emb(w_emb)  # [batch, q_dim]
q_emb2 = q_emb2(w_emb)
q_emb.shape
q.shape
q_emb2.shape
type(q_emb2)
class BAN_Model(nn.Module):
    """Bilinear Attention Network VQA model.

    The visual embedding comes from an optional MAML CNN, an optional
    auto-encoder, or a ResNet50 encoder, selected by flags on `args`.
    """

    def __init__(self, dataset, args):
        super(BAN_Model, self).__init__()
        self.args = args
        # init word embedding module, question embedding module, biAttention
        # network, bi_residual network, and classifier
        self.w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.cat)
        # args.cat concatenates a second 300-d word stream, doubling the RNN input
        self.q_emb = QuestionEmbedding(600 if args.cat else 300, args.hid_dim, 1, False, .0, args.rnn)
        self.bi_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim, args.glimpse)
        self.bi_resnet = BiResNet(args, dataset)
        self.classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2, dataset.num_ans_candidates, args)
        # build and load pre-trained MAML model
        if args.maml:
            weight_path = args.data_dir + '/' + args.maml_model_path
            print('load initial weights MAML from: %s' % (weight_path))
            self.maml = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
        # build and load pre-trained Auto-encoder model
        if args.autoencoder:
            self.ae = Auto_Encoder_Model()
            weight_path = args.data_dir + '/' + args.ae_model_path
            print('load initial weights DAE from: %s' % (weight_path))
            self.ae.load_state_dict(torch.load(weight_path))
            # project the flattened AE code (16384) down to a 64-d visual token
            self.convert = nn.Linear(16384, 64)
        # Loading tfidf weighted embedding
        if hasattr(args, 'tfidf'):
            self.w_emb = tfidf_loading(args.tfidf, self.w_emb, args)
        # Loading the other net
        if args.other_model:
            self.unet = Resnet50Encoder()

    def forward(self, v, q):
        """Forward
        v: [batch, num_objs, obj_dim]
        b: [batch, num_objs, b_dim]
        q: [batch_size, seq_length]

        return: logits, not probs (plus the AE reconstruction when enabled)
        """
        # get visual feature
        # NOTE(review): v_emb is unbound if none of maml/autoencoder/other_model
        # is enabled — presumably at least one flag is always set; confirm.
        if self.args.maml:
            maml_v_emb = self.maml(v[0]).unsqueeze(1)
            v_emb = maml_v_emb
        if self.args.autoencoder:
            encoder = self.ae.forward_pass(v[1])
            decoder = self.ae.reconstruct_pass(encoder)
            ae_v_emb = encoder.view(encoder.shape[0], -1)
            ae_v_emb = self.convert(ae_v_emb).unsqueeze(1)
            v_emb = ae_v_emb
        # when both encoders are active, concatenate their features
        if self.args.maml and self.args.autoencoder:
            v_emb = torch.cat((maml_v_emb, ae_v_emb), 2)
        if self.args.other_model:
            v_emb = self.unet(v)  # input: b,c,h,w c==3 ; output= b,c,1,1
            v_emb = v_emb.squeeze(3).squeeze(2).unsqueeze(1)  # b,1,c
        # get textual feature
        w_emb = self.w_emb(q)
        q_emb = self.q_emb.forward_all(w_emb)  # [batch, q_len, q_dim]
        # Attention
        att_p, logits = self.bi_att(v_emb, q_emb)  # b x g x v x q
        # bilinear residual network
        last_output = self.bi_resnet(v_emb, q_emb, att_p)
        # when the AE branch ran, also return its reconstruction for the AE loss
        if self.args.autoencoder:
            return last_output, decoder
        return last_output

    def classify(self, input_feats):
        # final answer logits from fused features
        return self.classifier(input_feats)
class BAN_Model(nn.Module):
    """BAN variant that routes closed- and open-ended questions through
    separate attention / residual / classifier branches, gated by a
    question-type attention vector (typeAttention).
    """

    def __init__(self, dataset, args):
        super(BAN_Model, self).__init__()
        self.args = args
        # init word embedding module, question embedding module, biAttention
        # network, bi_residual network, and classifier
        self.w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.cat)
        # args.cat concatenates a second 300-d word stream, doubling the RNN input
        self.q_emb = QuestionEmbedding(600 if args.cat else 300, args.hid_dim, 1, False, .0, args.rnn)
        # for close att + resnet + classify
        self.close_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim, args.glimpse)
        self.close_resnet = BiResNet(args, dataset)
        self.close_classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2, dataset.num_close_candidates, args)
        # for open_att + resnet + classify
        self.open_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim, args.glimpse)
        self.open_resnet = BiResNet(args, dataset)
        self.open_classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2, dataset.num_open_candidates, args)
        # type attention: b * 1024
        self.typeatt = typeAttention(dataset.dictionary.ntoken, './data/glove6b_init_300d.npy')
        # build and load pre-trained MAML model
        if args.maml:
            weight_path = args.data_dir + '/' + args.maml_model_path
            print('load initial weights MAML from: %s' % (weight_path))
            self.maml = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
        # build and load pre-trained Auto-encoder model
        if args.autoencoder:
            self.ae = Auto_Encoder_Model()
            weight_path = args.data_dir + '/' + args.ae_model_path
            print('load initial weights DAE from: %s' % (weight_path))
            self.ae.load_state_dict(torch.load(weight_path))
            # project the flattened AE code (16384) down to a 64-d visual token
            self.convert = nn.Linear(16384, 64)
        # Loading tfidf weighted embedding
        if hasattr(args, 'tfidf'):
            self.w_emb = tfidf_loading(args.tfidf, self.w_emb, args)
        # Loading the other net (placeholder, not implemented)
        if args.other_model:
            pass

    def forward(self, v, q, a, answer_target):
        """Forward
        v: [batch, num_objs, obj_dim]
        b: [batch, num_objs, b_dim]
        q: [batch_size, seq_length]
        a: answer labels; answer_target: ground-truth open/closed routing

        return: per-branch logits and split labels (plus AE reconstruction
        when enabled)
        """
        # get visual feature
        # NOTE(review): v_emb is unbound when neither maml nor autoencoder is
        # enabled — presumably at least one flag is always set; confirm.
        if self.args.maml:
            maml_v_emb = self.maml(v[0]).unsqueeze(1)
            v_emb = maml_v_emb
        if self.args.autoencoder:
            encoder = self.ae.forward_pass(v[1])
            decoder = self.ae.reconstruct_pass(encoder)
            ae_v_emb = encoder.view(encoder.shape[0], -1)
            ae_v_emb = self.convert(ae_v_emb).unsqueeze(1)
            v_emb = ae_v_emb
        # when both encoders are active, concatenate their features
        if self.args.maml and self.args.autoencoder:
            v_emb = torch.cat((maml_v_emb, ae_v_emb), 2)
        if self.args.other_model:
            pass
        # get type attention
        type_att = self.typeatt(q)
        # get textual feature (global)
        w_emb = self.w_emb(q)
        q_emb = self.q_emb.forward_all(w_emb)  # [batch, q_len, q_dim]
        # get open & close feature: split the batch by ground-truth routing
        v_open, v_close, q_open, q_close, a_open, a_close, typeatt_open, typeatt_close = seperate(
            v_emb, q_emb, a, type_att, answer_target)
        # diverse Attention -> (open + close)
        # att_p, logits = self.bi_att(v_emb, q_emb)  # b x g x v x q
        att_close, _ = self.close_att(v_close, q_close)
        att_open, _ = self.open_att(v_open, q_open)
        # bilinear residual network
        # last_output = self.bi_resnet(v_emb,q_emb,att_p)
        last_output_close = self.close_resnet(v_close, q_close, att_close)
        last_output_open = self.open_resnet(v_open, q_open, att_open)
        # type attention (5.19 try): gate each branch by its type vector
        last_output_close = last_output_close * typeatt_close
        last_output_open = last_output_open * typeatt_open
        if self.args.autoencoder:
            return last_output_close, last_output_open, a_close, a_open, decoder
        return last_output_close, last_output_open, a_close, a_open

    def classify(self, close_feat, open_feat):
        # per-branch answer logits
        return self.close_classifier(close_feat), self.open_classifier(
            open_feat)

    def forward_classify(self, v, q, a, classify):
        """Like forward(), but predicts the open/closed routing with the
        supplied `classify` network instead of using ground truth."""
        # get visual feature
        if self.args.maml:
            maml_v_emb = self.maml(v[0]).unsqueeze(1)
            v_emb = maml_v_emb
        if self.args.autoencoder:
            encoder = self.ae.forward_pass(v[1])
            decoder = self.ae.reconstruct_pass(encoder)
            ae_v_emb = encoder.view(encoder.shape[0], -1)
            ae_v_emb = self.convert(ae_v_emb).unsqueeze(1)
            v_emb = ae_v_emb
        if self.args.maml and self.args.autoencoder:
            v_emb = torch.cat((maml_v_emb, ae_v_emb), 2)
        if self.args.other_model:
            pass
        # get type attention
        type_att = self.typeatt(q)
        # get textual feature (global)
        w_emb = self.w_emb(q)
        q_emb = self.q_emb.forward_all(w_emb)  # [batch, q_len, q_dim]
        # get open & close feature: routing predicted by the classify net
        answer_target = classify(q)
        _, predicted = torch.max(answer_target, 1)
        v_open, v_close, q_open, q_close, a_open, a_close, typeatt_open, typeatt_close = seperate(
            v_emb, q_emb, a, type_att, predicted)
        # diverse Attention -> (open + close)
        # att_p, logits = self.bi_att(v_emb, q_emb)  # b x g x v x q
        att_close, _ = self.close_att(v_close, q_close)
        att_open, _ = self.open_att(v_open, q_open)
        # bilinear residual network
        # last_output = self.bi_resnet(v_emb,q_emb,att_p)
        last_output_close = self.close_resnet(v_close, q_close, att_close)
        last_output_open = self.open_resnet(v_open, q_open, att_open)
        # type attention (5.19 try)
        last_output_close = last_output_close * typeatt_close
        last_output_open = last_output_open * typeatt_open
        if self.args.autoencoder:
            return last_output_close, last_output_open, a_close, a_open, decoder
        return last_output_close, last_output_open, a_close, a_open