def build_ParalCoAtt(task_name, dataset, params):
    """Assemble an ``ActionModel`` driven by a stack of parallel co-attention layers.

    Args:
        task_name: Passed through unchanged as the first ``ActionModel`` argument.
        dataset: Object exposing ``v_dim``, which sizes the visual side of the
            co-attention layers and of the visual fusion attention.
        params: Mapping read for the keys ``'num_hid'``, ``'scale'``,
            ``'reasonSteps'`` and ``'sub_nums'``.

    Returns:
        The ``ActionModel`` wired with the question projection, the co-attention
        stack, both fusion attentions, the context gate and the classifier.
    """
    hidden = params['num_hid']
    # NOTE(review): 768 is hard-coded as the question feature width; presumably
    # the output size of an upstream text encoder -- confirm against the caller.
    question_proj = FCNet([768, hidden])
    doubled = hidden * 2

    # One ParalCoAttention per reasoning step, registered through ModuleList.
    steps = []
    for _ in range(params['reasonSteps']):
        steps.append(
            ParalCoAttention(
                dataset.v_dim,
                hidden,
                hidden,
                inter_dims=params['scale'],
                R=len(params['scale']),
            )
        )
    attention_stack = nn.ModuleList(steps)

    # The visual fusion is built before the question fusion (same order as
    # before) so parameter initialisation happens in the same sequence.
    visual_fusion = paraAttention(
        fuse_dim=dataset.v_dim,
        glimpses=params['sub_nums'],
        inputs_dim=dataset.v_dim,
        att_dim=hidden,
    )
    question_fusion = paraAttention(
        fuse_dim=hidden,
        glimpses=params['sub_nums'],
        inputs_dim=hidden,
        att_dim=hidden,
    )

    gate = FCNet([doubled, doubled])
    # NOTE(review): positional SimpleClassifier arguments are presumably
    # (in_dim, hid_dim, out_dim, dropout) -- confirm against its definition.
    head = SimpleClassifier(doubled, hidden * 2, 1, 0.5)

    return ActionModel(
        task_name,
        question_proj,
        attention_stack,
        question_fusion,
        visual_fusion,
        gate,
        head,
    )
def build_baseline(task_name, dataset, num_hid):
    """Assemble the baseline ``CountModel``.

    Args:
        task_name: Passed through unchanged as the first ``CountModel`` argument.
        dataset: Object exposing ``dictionary.ntoken`` (used to size the word
            embedding) and ``v_dim`` (input size of the visual encoder).
        num_hid: Hidden size shared by the encoders, projections and classifier.

    Returns:
        The ``CountModel`` built from the word embedding, question and visual
        encoders, their projection networks and the classifier.
    """
    word_dim = 300
    # Trailing positional arguments shared by both QuestionEmbedding encoders.
    encoder_tail = (num_hid, 1, False, 0.0)

    word_embedding = WordEmbedding(dataset.dictionary.ntoken, word_dim, 0.0)
    question_encoder = QuestionEmbedding(word_dim, *encoder_tail)
    # The visual stream reuses QuestionEmbedding, fed with dataset.v_dim inputs.
    visual_encoder = QuestionEmbedding(dataset.v_dim, *encoder_tail)

    question_net = FCNet([question_encoder.num_hid, num_hid])
    visual_net = FCNet([num_hid, num_hid])
    # NOTE(review): positional SimpleClassifier arguments are presumably
    # (in_dim, hid_dim, out_dim, dropout) -- confirm against its definition.
    head = SimpleClassifier(num_hid, num_hid * 2, 1, 0.5)

    return CountModel(
        task_name,
        word_embedding,
        question_encoder,
        visual_encoder,
        question_net,
        visual_net,
        head,
    )
def build_temporalAtt(task_name, n_layer, dataset, num_hid, dictionary, glove_file):
    """Assemble a ``CountModel`` from a video encoder and a question encoder.

    Args:
        task_name: Forwarded to ``init_embedding`` and to ``CountModel``.
        n_layer: Number of layers handed to ``Encoder``.
        dataset: Not read by this builder; kept for interface compatibility.
        num_hid: Size used for the classifier.
        dictionary: Object exposing ``ntoken`` and ``c_ntoken``; also passed to
            ``init_embedding``.
        glove_file: Passed to ``init_embedding`` together with ``dictionary``.

    Returns:
        The ``CountModel`` built from the video encoder, question encoder and
        classifier.
    """
    encoder_options = {
        'n_layer': n_layer,
        'n_head': 8,
        'd_k': 256,
        'd_v': 256,
        'v_len': 36,
        'v_emb_dim': 300,
        'd_model': 2048,
        'd_inner_hid': 512,
        'dropout': 0.1,
    }
    video_encoder = Encoder(**encoder_options)

    embedding = WordEmbedding(dictionary.ntoken, dictionary.c_ntoken, 300, 64, 0.1)
    # init_embedding hands back two matrices that are fed straight into the
    # question encoder, in this order.
    word_matrix, char_matrix = embedding.init_embedding(dictionary, glove_file, task_name)
    question_encoder = Ques_Encoder(word_matrix, char_matrix)

    # NOTE(review): positional SimpleClassifier arguments are presumably
    # (in_dim, hid_dim, out_dim, dropout) -- confirm against its definition.
    head = SimpleClassifier(num_hid, num_hid * 2, 1, 0.5)

    return CountModel(task_name, video_encoder, question_encoder, head)
def build_temporalAtt(task_name, dataset, params):
    """Assemble a ``FrameQAModel`` with co-attention and two fusion attentions.

    Args:
        task_name: Passed through unchanged as the first ``FrameQAModel`` argument.
        dataset: Object exposing ``v_dim`` (visual feature size) and
            ``num_ans_candidates`` (third classifier argument).
        params: Mapping read for the keys ``'num_hid'`` and ``'sub_nums'``.

    Returns:
        The ``FrameQAModel`` wired with the question projection, co-attention,
        question/visual fusion attentions and the classifier.
    """
    hidden = params['num_hid']
    # NOTE(review): 768 is hard-coded as the question feature width; presumably
    # the output size of an upstream text encoder -- confirm against the caller.
    question_proj = FCNet([768, hidden])
    doubled = hidden * 2

    co_attention = CoAttention(dataset.v_dim, hidden, doubled)

    # Keyword sets for the two fusion attentions; the visual one is built first
    # to keep the module construction order unchanged.
    visual_kwargs = {
        'fuse_dim': dataset.v_dim,
        'glimpses': params['sub_nums'],
        'inputs_dim': dataset.v_dim,
        'att_dim': hidden,
    }
    visual_fusion = paraAttention(**visual_kwargs)

    question_kwargs = {
        'fuse_dim': hidden,
        'glimpses': params['sub_nums'],
        'inputs_dim': hidden,
        'att_dim': hidden,
    }
    question_fusion = paraAttention(**question_kwargs)

    # NOTE(review): positional SimpleClassifier arguments are presumably
    # (in_dim, hid_dim, out_dim, dropout) -- confirm against its definition.
    head = SimpleClassifier(doubled, hidden * 2, dataset.num_ans_candidates, 0.5)

    return FrameQAModel(
        task_name,
        question_proj,
        co_attention,
        question_fusion,
        visual_fusion,
        head,
    )
def build_temporalAtt(task_name, dataset, params):
    """Assemble an ``ActionModel`` with co-attention and two fusion attentions.

    Args:
        task_name: Passed through unchanged as the first ``ActionModel`` argument.
        dataset: Object exposing ``v_dim``, the visual feature size.
        params: Mapping read for the keys ``'num_hid'`` and ``'sub_nums'``.

    Returns:
        The ``ActionModel`` wired with the question projection, co-attention,
        question/visual fusion attentions and the classifier.
    """
    width = params['num_hid']
    doubled = width * 2

    def make_fusion(dim):
        # Both fusion attentions use `dim` for fuse_dim and inputs_dim alike.
        return paraAttention(
            fuse_dim=dim,
            glimpses=params['sub_nums'],
            inputs_dim=dim,
            att_dim=width,
        )

    # NOTE(review): 768 is hard-coded as the question feature width; presumably
    # the output size of an upstream text encoder -- confirm against the caller.
    question_proj = FCNet([768, width])
    co_attention = CoAttention(dataset.v_dim, width, doubled)

    # Visual fusion first, then question fusion: same construction order as
    # before, so parameter initialisation happens in the same sequence.
    visual_fusion = make_fusion(dataset.v_dim)
    question_fusion = make_fusion(width)

    # NOTE(review): positional SimpleClassifier arguments are presumably
    # (in_dim, hid_dim, out_dim, dropout) -- confirm against its definition.
    head = SimpleClassifier(doubled, doubled, 1, 0.5)

    return ActionModel(
        task_name,
        question_proj,
        co_attention,
        question_fusion,
        visual_fusion,
        head,
    )