def build_baseline0_newatt_lstm_bidirection(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding2(300, num_hid, 1, True, 0.0, rnn_type='LSTM')
    v_att = NewAttention2(dataset.v_dim, q_emb.out_hid, num_hid)
    q_net = FCNet([q_emb.out_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
def build_baseline0_newatt(dataset, num_hid, args):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.5)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.5)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, args)
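# A minimal usage sketch for builders of this shape, assuming the standard
# bottom-up/top-down setup: `dataset` exposes dictionary.ntoken, v_dim, and
# num_ans_candidates, questions arrive as padded token-id tensors, and the
# visual input is [batch, k, v_dim] region features. The dataset class name,
# batch sizes, and the model's forward signature below are illustrative
# assumptions, not taken from this file:
#
#   train_dset = VQAFeatureDataset('train', dictionary)
#   model = build_baseline0_newatt(train_dset, num_hid=1024, args=args)
#   v = torch.randn(batch_size, 36, train_dset.v_dim)  # region features
#   q = torch.zeros(batch_size, 14, dtype=torch.long)  # padded question ids
#   logits = model(v, q)  # [batch_size, num_ans_candidates]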
def build_model(dataset, v_dim, num_hid, logger=None):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, logger)
def build_attention_model(dataset, args):
    num_hid = args.num_hid
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = SoftAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return Attention_Model(w_emb, q_emb, v_att, q_net, v_net, classifier)
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)  # [batch, ntoken+1, 300]
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb1 = QuestionEmbedding1(300)
    q_emb2 = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb2.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return CNNModel1(w_emb, q_emb1, q_emb2, v_att, q_net, v_net, classifier)
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = StackAttention(num_hid, num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    linear = torch.nn.Linear(dataset.v_dim, num_hid)
    classifier = SimpleClassifier(num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return SANModel(w_emb, q_emb, v_att, q_net, v_net, classifier, linear)
def build_dualatt(dataset, num_hid, args):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, args.rnn_layer, False, 0.4)
    v_att = DualAttention(dataset.v_dim, q_emb.num_hid, num_hid, 0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, args)
def build_baseline2(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = nn.Linear(dataset.v_dim, 300)
    v_bn = nn.BatchNorm1d(300, momentum=0.01)
    lstm = nn.LSTM(300, num_hid, 1, batch_first=True)
    classifier = SimpleClassifier(num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, lstm, v_bn)
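# Sketch of the visual pathway the extra modules above imply (an assumption
# about how this BaseModel variant wires them together; shapes illustrative):
#
#   v: [batch, k, v_dim] --v_net--> [batch, k, 300] --v_bn--> normalized
#      --lstm--> [batch, k, num_hid]   # regions consumed as a 300-d sequence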
def build_baseline(dataset):
    opt = config.parse_opt()
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    classifier = SimpleClassifier(opt.NUM_HIDDEN, opt.MID_DIM, 1, opt.FC_DROPOUT)
    return BaseModel(w_emb, q_emb, v_att, classifier, v_emb)
def build_LL_newatt(dataset, num_hid):
    # Learning-loss variant: an auxiliary head (classifier_LL) scores
    # concatenated intermediate features alongside the usual answer head.
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    # integer division keeps the hidden dim an int (num_hid / 8 is a float in Python 3)
    classifier_LL = SimpleClassifier(num_hid * 4, num_hid // 8, 1, 0.5)
    classifier_All = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return LL4ALModel(w_emb, q_emb, v_att, q_net, v_net, classifier_LL, classifier_All)
def build_BAN(dataset, args, priotize_using_counter=None):
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid, 1, False, .0, args.rnn)
    v_att = BiAttention(dataset.v_dim, args.num_hid, args.num_hid, args.gamma)
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % weight_path)
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s' % weight_path)
        ae_v_emb.load_state_dict(torch.load(weight_path))
    # load tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # optional counter module for BAN; the default None defers to args.use_counter
    # (a False default would make the `is None` check below dead code)
    use_counter = args.use_counter if priotize_using_counter is None else priotize_using_counter
    if use_counter:
        objects = 10  # minimum number of boxes
        counter = Counter(objects)
    else:
        counter = None
    # init BAN residual network
    b_net = []
    q_prj = []
    c_prj = []
    for i in range(args.gamma):
        b_net.append(BCNet(dataset.v_dim, args.num_hid, args.num_hid, None, k=1))
        q_prj.append(FCNet([args.num_hid, args.num_hid], '', .2))
        if use_counter:
            c_prj.append(FCNet([objects + 1, args.num_hid], 'ReLU', .0))
    # init classifier
    classifier = SimpleClassifier(args.num_hid, args.num_hid * 2, dataset.num_ans_candidates, args)
    # construct VQA model and return
    if args.maml and args.autoencoder:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier, counter, args, maml_v_emb, ae_v_emb)
    elif args.maml:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier, counter, args, maml_v_emb, None)
    elif args.autoencoder:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier, counter, args, None, ae_v_emb)
    return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier, counter, args, None, None)
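# Sketch of the argument namespace build_BAN reads (attribute names are taken
# from the function body above; the values are illustrative only):
#
#   args = argparse.Namespace(
#       op='c',               # 'c' switches the question encoder input to 600-d
#       num_hid=1024, gamma=2, rnn='LSTM',
#       maml=False, autoencoder=False, use_counter=False,
#       tfidf=True, RAD_dir='data_RAD')
#   model = build_BAN(dataset, args)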
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    c_1 = MLP(input_dim=1024, dimensions=[1024, 1024, dataset.num_ans_candidates])
    c_2 = nn.Linear(dataset.num_ans_candidates, dataset.num_ans_candidates)
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, c_1, c_2)
def build_baseline(dataset, opt):
    # use the opt passed in by the caller; re-parsing config here would
    # silently discard it
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    r_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    v_fc = Videofc(opt.GLIMPSE, opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN, opt.FC_DROPOUT)
    a_emb = AnswerEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    rela_emb = Rela_Module(opt.NUM_HIDDEN * 3, opt.NUM_HIDDEN, opt.NUM_HIDDEN)
    classifier = SimpleClassifier(opt.NUM_HIDDEN, opt.MID_DIM, dataset.num_ans, opt.FC_DROPOUT)
    return BaseModel(w_emb, q_emb, v_emb, a_emb, v_att, v_fc, rela_emb, r_att, classifier, opt)
def build_multimodal_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    # the vision-only head consumes the raw 2048-d visual feature directly
    classifier_V = SimpleClassifier(2048, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_Q = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_All = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return MultiModalModel(w_emb, q_emb, v_att, q_net, v_net, classifier_V, classifier_Q, classifier_All)
def build_CCB_model(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_ct_net = FCNet([q_emb.num_hid, num_hid])
    q_cx_net = FCNet([q_emb.num_hid, num_hid])
    v_ct_net = FCNet([dataset.v_dim, num_hid])
    v_cx_net = FCNet([dataset.v_dim, num_hid])
    classifier_fq = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_vq = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return CCB_Model(w_emb, q_emb, v_att, q_ct_net, q_cx_net, v_ct_net, classifier_fq, classifier_vq, v_cx_net)
def __init__(self, dataset, args):
    super(BAN_Model, self).__init__()
    self.args = args
    # init word embedding module, question embedding module, biAttention
    # network, bi_residual network, and classifier
    self.w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.cat)
    self.q_emb = QuestionEmbedding(600 if args.cat else 300, args.hid_dim, 1, False, .0, args.rnn)
    # close att + resnet + classify
    self.close_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim, args.glimpse)
    self.close_resnet = BiResNet(args, dataset)
    self.close_classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2, dataset.num_close_candidates, args)
    # open att + resnet + classify
    self.open_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim, args.glimpse)
    self.open_resnet = BiResNet(args, dataset)
    self.open_classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2, dataset.num_open_candidates, args)
    # type attention: b * 1024
    self.typeatt = typeAttention(dataset.dictionary.ntoken, './data/glove6b_init_300d.npy')
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.data_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % weight_path)
        self.maml = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        self.ae = Auto_Encoder_Model()
        weight_path = args.data_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s' % weight_path)
        self.ae.load_state_dict(torch.load(weight_path))
        self.convert = nn.Linear(16384, 64)
    # load tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        self.w_emb = tfidf_loading(args.tfidf, self.w_emb, args)
    # load the other net
    if args.other_model:
        pass
def build_caq_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid // 2])
    v_net = FCNet([dataset.v_dim, num_hid // 2])
    updated_query_composer = FCNet([num_hid + num_hid // 2, num_hid])
    neighbour_attention = MultiHeadedAttention(4, num_hid // 2, dropout=0.1)
    Dropout_C = nn.Dropout(0.1)
    classifier = SimpleClassifier(num_hid // 2, num_hid * 2, dataset.num_ans_candidates + 1, 0.5)
    return CAQModel(w_emb, q_emb, v_att, q_net, v_net, updated_query_composer,
                    neighbour_attention, Dropout_C, classifier, dataset)
class typeAttention(nn.Module):
    def __init__(self, size_question, path_init):
        super(typeAttention, self).__init__()
        self.w_emb = WordEmbedding(size_question, 300, 0.0, False)
        self.w_emb.init_embedding(path_init)
        self.q_emb = QuestionEmbedding(300, 1024, 1, False, 0.0, 'GRU')
        self.q_final = QuestionAttention(1024)
        self.f_fc1 = linear(1024, 2048)
        self.f_fc2 = linear(2048, 1024)
        self.f_fc3 = linear(1024, 1024)

    def forward(self, question):
        w_emb = self.w_emb(question)
        q_emb = self.q_emb.forward_all(w_emb)  # [batch, q_len, q_dim]
        q_final = self.q_final(w_emb, q_emb)  # b, 1024
        x_f = self.f_fc1(q_final)
        x_f = F.relu(x_f)
        x_f = self.f_fc2(x_f)
        # respect train/eval mode; a bare F.dropout stays active at eval time
        x_f = F.dropout(x_f, training=self.training)
        x_f = F.relu(x_f)
        x_f = self.f_fc3(x_f)
        return x_f
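# Shape walkthrough for typeAttention.forward, derived from the layer sizes
# above (`batch` and `q_len` are illustrative placeholders):
#
#   question: [batch, q_len]        token ids
#   w_emb:    [batch, q_len, 300]   GloVe-initialized word vectors
#   q_emb:    [batch, q_len, 1024]  per-step GRU states via forward_all
#   q_final:  [batch, 1024]         attention-pooled question vector
#   x_f:      [batch, 1024]         after the fc1 -> fc2 -> fc3 projection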
def build_baseline(dataset, opt):
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    r_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    v_fc = Videofc(opt.GLIMPSE, opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN, opt.FC_DROPOUT)
    a_emb = AnswerEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    rela_emb = Rela_Module(opt.NUM_HIDDEN * 3, opt.NUM_HIDDEN, opt.NUM_HIDDEN)
    classifier = SimpleClassifier(opt.NUM_HIDDEN * 2, opt.MID_DIM, 1, opt.FC_DROPOUT)
    ques_att = Q_Att(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    # vlinear = FCNet([opt.NUM_HIDDEN, opt.MID_DIM, opt.NUM_HIDDEN])
    # rlinear = FCNet([opt.NUM_HIDDEN, opt.MID_DIM, opt.NUM_HIDDEN])
    return BaseModel(w_emb, q_emb, v_emb, a_emb, v_att, v_fc, rela_emb, r_att, classifier, ques_att, opt)
def build_model(dataset, num_hid, dropout, norm, activation, dropL, dropG, dropW, dropC):
    caption_w_emb = WordEmbedding(dataset.caption_dictionary.ntoken, emb_dim=300, dropout=dropW)
    caption_decoder_class = SimpleClassifier(
        in_dim=num_hid, hid_dim=2 * num_hid, out_dim=dataset.caption_dictionary.ntoken,
        dropout=dropC, norm=norm, act=activation)
    return CaptionDecoderRNN(300, num_hid, 2048, caption_w_emb, caption_decoder_class)
def build_baseline0_gcn(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding_all(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net0 = FCNet([q_emb.num_hid, num_hid])
    v_net0 = FCNet([dataset.v_dim, num_hid])
    gcn = FCNet([num_hid, num_hid])
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return GraphModel(w_emb, q_emb, v_att, q_net0, v_net0, gcn, q_net, v_net, classifier)
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    cls_att = NewAttention(dataset.cls_dim, q_emb.num_hid, num_hid)
    attr_att = NewAttention(dataset.attr_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    cls_net = FCNet([dataset.cls_dim, num_hid])
    attr_net = FCNet([dataset.attr_dim, num_hid])
    fusion_dim = 16000
    mcb = CompactBilinearPooling(num_hid, num_hid, fusion_dim)
    classifier = SimpleClassifier(fusion_dim, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, cls_att, attr_att, q_net, v_net,
                     cls_net, attr_net, classifier, mcb)
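# Sketch of what the MCB module above computes (the names q_repr/v_repr are
# illustrative): compact bilinear pooling fuses two num_hid-d vectors into one
# fusion_dim-d vector, which is why the classifier's input dim is fusion_dim
# rather than num_hid:
#
#   mcb = CompactBilinearPooling(num_hid, num_hid, 16000)
#   fused = mcb(q_repr, v_repr)  # [batch, 16000]
#   logits = classifier(fused)   # [batch, num_ans_candidates]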
def build_model_A2x3(dataset, num_hid, dropout, norm, activation, dropL, dropG, dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, emb_dim=300, dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1, bidirect=False, dropout=dropG, rnn_type='GRU')
    v_att_1 = Att_2(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid, dropout=dropout, norm=norm, act=activation)
    v_att_2 = Att_2(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid, dropout=dropout, norm=norm, act=activation)
    v_att_3 = Att_2(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid, dropout=dropout, norm=norm, act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid], dropout=dropL, norm=norm, act=activation)
    v_net = FCNet([dataset.v_dim, num_hid], dropout=dropL, norm=norm, act=activation)
    classifier = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid, out_dim=dataset.num_ans_candidates,
                                  dropout=dropC, norm=norm, act=activation)
    return Model_3(w_emb, q_emb, v_att_1, v_att_2, v_att_3, q_net, v_net, classifier)
def build_ban_foil(dataset, num_hid, num_ans_candidates, op='', gamma=4):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    b_net = []
    q_prj = []
    c_prj = []
    objects = 10  # minimum number of boxes
    for i in range(gamma):
        b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    classifier = SimpleClassifierFoil(num_hid, 64, num_ans_candidates)
    counter = Counter(objects)
    return BanModel(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier, counter, op, gamma)
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    if not dataset.bert:
        q_att = SelfAttention(q_emb.num_hid, num_hid)
        v_att = NewAttention(dataset.v_dim + 2, q_emb.num_hid, num_hid)
        q_net = FCNet([q_emb.num_hid, num_hid])
    else:
        # with BERT features, the RNN question encoder is replaced by a
        # 768-d projection
        q_att = SelfAttention(768, num_hid)
        q_emb = FCNet([768, 768])
        v_att = NewAttention(dataset.v_dim, 768, num_hid)
        q_net = FCNet([768, num_hid])
    v_net = FCNet([dataset.v_dim + 2, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, q_att, v_att, q_net, v_net, classifier, dataset.bert)
def build_baseline0_newatt(dataset, num_hid, reconstruction, size=64, dropout_hid=0.0,
                           gamma_r=0.0, adv_mode="wgan", logger=None):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, num_hid, dataset.v_dim,
                     reconstruction, size, dropout_hid, gamma_r, adv_mode, logger)
def build_model_A3x2_h(dataset, num_hid, dropout, norm, activation, dropL, dropG, dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, emb_dim=300, dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1, bidirect=False, dropout=dropG, rnn_type='GRU')
    print('v_dim: %d\tq_dim: %d\tnum_hid: %d\t num ans candidates: %d'
          % (dataset.v_dim, q_emb.num_hid, num_hid, dataset.num_ans_candidates))
    v_att_1 = Att_3(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid, dropout=dropout, norm=norm, act=activation)
    v_att_2 = Att_3(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid, dropout=dropout, norm=norm, act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid], dropout=dropL, norm=norm, act=activation)
    v_net = FCNet([dataset.v_dim, num_hid], dropout=dropL, norm=norm, act=activation)
    h_net = HNet([1280, 100, 100], [1280, 1280])
    classifier = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid, out_dim=dataset.num_ans_candidates,
                                  dropout=dropC, norm=norm, act=activation)
    return Model_h(w_emb, q_emb, v_att_1, v_att_2, q_net, v_net, h_net, classifier)
def build_baseline0_newatt2(args, num_hid):
    w_emb = WordEmbedding(args.vocab_size, args.ninp, 0.0)
    q_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)
    h_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)
    # the bidirectional encoders double the hidden size, hence the *2 throughout
    v_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    h_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    qih_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    qhi_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    q_net = FCNet([q_emb.num_hid * 2, num_hid * 2])
    v_net = FCNet([args.nhid * 2, num_hid * 2])
    h_net = FCNet([args.nhid * 2, num_hid * 2])
    qih_net = FCNet([args.nhid * 2, num_hid * 2])
    qhi_net = FCNet([args.nhid * 2, num_hid * 2])
    qhih_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    qihi_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    decoder = netG(args)
    return BaseModel2(w_emb, q_emb, h_emb, v_att, h_att, q_net, v_net, h_net,
                      qih_att, qhi_att, qih_net, qhi_net, decoder, args, qhih_att, qihi_att)
def build_SAN(dataset, args):
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, args.op)
    q_emb = QuestionEmbedding(300 if "c" not in args.op else 600, args.num_hid, 1, False, 0.0, args.rnn)
    v_att = StackedAttention(args.num_stacks, dataset.v_dim, args.num_hid, args.num_hid,
                             dataset.num_ans_candidates, args.dropout)
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + "/" + args.maml_model_path
        print("load initial weights MAML from: %s" % weight_path)
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + "/" + args.ae_model_path
        print("load initial weights DAE from: %s" % weight_path)
        ae_v_emb.load_state_dict(torch.load(weight_path, args.map_location))
    # load tfidf weighted embedding
    if hasattr(args, "tfidf"):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # init classifier
    classifier = SimpleClassifier(args.num_hid, 2 * args.num_hid, dataset.num_ans_candidates, args)
    # construct VQA model and return
    if args.maml and args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb, ae_v_emb)
    elif args.maml:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb, None)
    elif args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, ae_v_emb)
    return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, None)