Example #1
def build_baseline0_newatt_lstm_bidirection(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding2(300, num_hid, 1, True, 0.0, rnn_type='LSTM')
    v_att = NewAttention2(dataset.v_dim, q_emb.out_hid, num_hid)
    q_net = FCNet([q_emb.out_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
def build_baseline0_newatt(dataset, num_hid, args):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.5)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.5)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, args)
def build_model(dataset, v_dim, num_hid, logger=None):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, logger)
def build_attention_model(dataset, args):
    num_hid = args.num_hid
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = SoftAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return Attention_Model(w_emb, q_emb, v_att, q_net, v_net, classifier)
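Every builder above assembles the same core pieces (word embedding, question RNN, visual attention, two projection nets, classifier) and hands them to a model class. For orientation, here is a sketch of the forward pass this wiring implies, reconstructed from the common bottom-up-attention-vqa pattern; this is an assumption, since none of the model classes are shown in these snippets:

import torch.nn as nn

class BaseModelSketch(nn.Module):
    """Hypothetical reconstruction of the BaseModel these builders feed."""
    def __init__(self, w_emb, q_emb, v_att, q_net, v_net, classifier):
        super().__init__()
        self.w_emb, self.q_emb = w_emb, q_emb
        self.v_att, self.q_net, self.v_net = v_att, q_net, v_net
        self.classifier = classifier

    def forward(self, v, q):
        w_emb = self.w_emb(q)            # [batch, seq_len, 300] word vectors
        q_emb = self.q_emb(w_emb)        # [batch, q_dim] final RNN state
        att = self.v_att(v, q_emb)       # [batch, num_regions, 1] weights
        v_emb = (att * v).sum(1)         # attended visual feature
        joint = self.q_net(q_emb) * self.v_net(v_emb)  # elementwise fusion
        return self.classifier(joint)    # answer logits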
Example #5
def build_baseline0(dataset, num_hid):
    # [batch,ntoken+1,300]
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(
        num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb1 = QuestionEmbedding1(300)
    q_emb2 = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb2.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(
        num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return CNNModel1(w_emb, q_emb1, q_emb2, v_att, q_net, v_net, classifier)
Example #7
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = StackAttention(num_hid, num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    linear = torch.nn.Linear(dataset.v_dim, num_hid)
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return SANModel(w_emb, q_emb, v_att, q_net, v_net, classifier, linear)
def build_dualatt(dataset, num_hid, args):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, args.rnn_layer, False, 0.4)
    v_att = DualAttention(dataset.v_dim, q_emb.num_hid, num_hid, 0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    model = BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, args)
    return model
Example #9
def build_baseline2(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = nn.Linear(dataset.v_dim, 300)
    v_bn = nn.BatchNorm1d(300, momentum=0.01)
    lstm = nn.LSTM(300, num_hid, 1, batch_first=True)
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, lstm, v_bn)
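Example #9 additionally passes a linear projection, a BatchNorm1d, and an LSTM for the visual stream; how BaseModel consumes them is not shown. A sketch of the pipeline those shapes suggest, with all dimensions and the regions-as-sequence reading being assumptions:

import torch
import torch.nn as nn

v = torch.randn(8, 36, 2048)                 # [batch, regions, v_dim], dummy
v_net = nn.Linear(2048, 300)                 # project each region to 300-d
v_bn = nn.BatchNorm1d(300, momentum=0.01)    # normalize the 300 channels
lstm = nn.LSTM(300, 1024, 1, batch_first=True)

x = v_net(v)                                 # [8, 36, 300]
x = v_bn(x.transpose(1, 2)).transpose(1, 2)  # BatchNorm1d wants channels at dim 1
out, (h, c) = lstm(x)                        # regions treated as a sequence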
Example #10
def build_baseline(dataset):
    opt = config.parse_opt()
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                              opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                           opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    classifier = SimpleClassifier(opt.NUM_HIDDEN, opt.MID_DIM, 1,
                                  opt.FC_DROPOUT)
    return BaseModel(w_emb, q_emb, v_att, classifier, v_emb)
def build_LL_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    # num_hid / 8 would be a float in Python 3; layer sizes must be ints
    classifier_LL = SimpleClassifier(
        num_hid * 4, num_hid // 8, 1, 0.5)
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return LL4ALModel(w_emb, q_emb, v_att, q_net, v_net, classifier_LL,
                      classifier_All)
Example #12
def build_BAN(dataset, args, priotize_using_counter=None):  # None: defer to args.use_counter
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid,
                              1, False, .0, args.rnn)
    v_att = BiAttention(dataset.v_dim, args.num_hid, args.num_hid, args.gamma)
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s' % (weight_path))
        ae_v_emb.load_state_dict(torch.load(weight_path))
    # Loading tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # Optional module: counter for BAN
    use_counter = (args.use_counter if priotize_using_counter is None
                   else priotize_using_counter)
    if use_counter or priotize_using_counter:
        objects = 10  # minimum number of boxes
        counter = Counter(objects)
    else:
        counter = None
    # init BAN residual network
    b_net = []
    q_prj = []
    c_prj = []
    for i in range(args.gamma):
        b_net.append(
            BCNet(dataset.v_dim, args.num_hid, args.num_hid, None, k=1))
        q_prj.append(FCNet([args.num_hid, args.num_hid], '', .2))
        if use_counter or priotize_using_counter:
            c_prj.append(FCNet([objects + 1, args.num_hid], 'ReLU', .0))
    # init classifier
    classifier = SimpleClassifier(args.num_hid, args.num_hid * 2,
                                  dataset.num_ans_candidates, args)
    # construct the VQA model and return it
    if args.maml and args.autoencoder:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                         classifier, counter, args, maml_v_emb, ae_v_emb)
    elif args.maml:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                         classifier, counter, args, maml_v_emb, None)
    elif args.autoencoder:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                         classifier, counter, args, None, ae_v_emb)
    return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                     classifier, counter, args, None, None)
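A hypothetical call site for build_BAN; the Namespace fields are exactly the ones the function reads above, with illustrative values. The dataset object, and tfidf_loading returning w_emb unchanged when the flag is off, are assumptions:

from argparse import Namespace

args = Namespace(op='c', num_hid=1024, rnn='LSTM', gamma=2,
                 maml=False, autoencoder=False, use_counter=False,
                 tfidf=False, RAD_dir='data_RAD')
# SimpleClassifier also receives `args` wholesale, so the local
# implementation may read further fields than those listed here.
model = build_BAN(dataset, args)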
Example #13
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    c_1 = MLP(input_dim=1024,
              dimensions=[1024, 1024, dataset.num_ans_candidates])
    c_2 = nn.Linear(dataset.num_ans_candidates, dataset.num_ans_candidates)
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, c_1, c_2)
Example #14
def build_baseline(dataset, opt):
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                              opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                           opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    r_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    v_fc = Videofc(opt.GLIMPSE, opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                   opt.FC_DROPOUT)
    a_emb = AnswerEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                            opt.L_RNN_DROPOUT)
    rela_emb = Rela_Module(opt.NUM_HIDDEN * 3, opt.NUM_HIDDEN, opt.NUM_HIDDEN)
    classifier = SimpleClassifier(opt.NUM_HIDDEN, opt.MID_DIM, dataset.num_ans,
                                  opt.FC_DROPOUT)
    return BaseModel(w_emb, q_emb, v_emb, a_emb, v_att, v_fc, rela_emb, r_att,
                     classifier, opt)
def build_multimodal_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier_V = SimpleClassifier(
        2048, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_Q = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return MultiModalModel(w_emb, q_emb, v_att, q_net, v_net, classifier_V,
                           classifier_Q, classifier_All)
Example #16
def build_CCB_model(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_ct_net = FCNet([q_emb.num_hid, num_hid])
    q_cx_net = FCNet([q_emb.num_hid, num_hid])
    v_ct_net = FCNet([dataset.v_dim, num_hid])
    v_cx_net = FCNet([dataset.v_dim, num_hid])
    classifier_fq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    classifier_vq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    return CCB_Model(w_emb, q_emb, v_att, q_ct_net, q_cx_net, v_ct_net,
                     classifier_fq, classifier_vq, v_cx_net)
Example #17
    def __init__(self, dataset, args):
        super(BAN_Model, self).__init__()

        self.args = args
        # init word embedding module, question embedding module, biAttention
        # network, bi_residual network, and classifier
        self.w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0,
                                   args.cat)
        self.q_emb = QuestionEmbedding(600 if args.cat else 300, args.hid_dim,
                                       1, False, .0, args.rnn)

        # for close att+ resnet + classify
        self.close_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim,
                                     args.glimpse)
        self.close_resnet = BiResNet(args, dataset)
        self.close_classifier = SimpleClassifier(args.hid_dim,
                                                 args.hid_dim * 2,
                                                 dataset.num_close_candidates,
                                                 args)

        # for open_att + resnet + classify
        self.open_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim,
                                    args.glimpse)
        self.open_resnet = BiResNet(args, dataset)
        self.open_classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2,
                                                dataset.num_open_candidates,
                                                args)

        # type attention: b * 1024
        self.typeatt = typeAttention(dataset.dictionary.ntoken,
                                     './data/glove6b_init_300d.npy')

        # build and load pre-trained MAML model
        if args.maml:
            weight_path = args.data_dir + '/' + args.maml_model_path
            print('load initial weights MAML from: %s' % (weight_path))
            self.maml = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
        # build and load pre-trained Auto-encoder model
        if args.autoencoder:
            self.ae = Auto_Encoder_Model()
            weight_path = args.data_dir + '/' + args.ae_model_path
            print('load initial weights DAE from: %s' % (weight_path))
            self.ae.load_state_dict(torch.load(weight_path))
            self.convert = nn.Linear(16384, 64)
        # Loading tfidf weighted embedding
        if hasattr(args, 'tfidf'):
            self.w_emb = tfidf_loading(args.tfidf, self.w_emb, args)

        # Loading the other net
        if args.other_model:
            pass
Example #18
def build_caq_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid // 2])
    v_net = FCNet([dataset.v_dim, num_hid // 2])
    updated_query_composer = FCNet([num_hid + num_hid // 2, num_hid])
    neighbour_attention = MultiHeadedAttention(4, num_hid // 2, dropout=0.1)
    Dropout_C = nn.Dropout(0.1)

    classifier = SimpleClassifier(num_hid // 2, num_hid * 2,
                                  dataset.num_ans_candidates + 1, 0.5)
    return CAQModel(w_emb, q_emb, v_att, q_net, v_net, updated_query_composer,
                    neighbour_attention, Dropout_C, classifier, dataset)
Example #19
class typeAttention(nn.Module):
    def __init__(self, size_question, path_init):
        super(typeAttention, self).__init__()
        self.w_emb = WordEmbedding(size_question, 300, 0.0, False)
        self.w_emb.init_embedding(path_init)
        self.q_emb = QuestionEmbedding(300, 1024, 1, False, 0.0, 'GRU')
        self.q_final = QuestionAttention(1024)
        self.f_fc1 = linear(1024, 2048)
        self.f_fc2 = linear(2048, 1024)
        self.f_fc3 = linear(1024, 1024)

    def forward(self, question):
        w_emb = self.w_emb(question)
        q_emb = self.q_emb.forward_all(w_emb)  # [batch, q_len, q_dim]
        q_final = self.q_final(w_emb, q_emb)  # b, 1024

        x_f = self.f_fc1(q_final)
        x_f = F.relu(x_f)
        x_f = self.f_fc2(x_f)
        # note: F.dropout defaults to training=True, so this dropout stays
        # active even in eval mode
        x_f = F.dropout(x_f)
        x_f = F.relu(x_f)
        x_f = self.f_fc3(x_f)

        return x_f
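A minimal smoke test for typeAttention, assuming a [batch, seq_len] tensor of token ids and a dummy vocabulary size; the GloVe path is the constructor argument shown above:

import torch

tatt = typeAttention(size_question=20000,
                     path_init='./data/glove6b_init_300d.npy')
question = torch.randint(0, 20000, (8, 12))  # dummy token ids
type_feat = tatt(question)                   # expected shape: [8, 1024]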
Example #20
def build_baseline(dataset, opt):
    w_emb = WordEmbedding(dataset.dictionary.ntokens(), 300, opt.EMB_DROPOUT)
    q_emb = QuestionEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                              opt.L_RNN_DROPOUT)
    v_emb = VideoEmbedding(opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                           opt.NUM_LAYER, opt.BIDIRECT, opt.L_RNN_DROPOUT)
    v_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    r_att = Attention(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    v_fc = Videofc(opt.GLIMPSE, opt.C3D_SIZE + opt.RES_SIZE, opt.NUM_HIDDEN,
                   opt.FC_DROPOUT)
    a_emb = AnswerEmbedding(300, opt.NUM_HIDDEN, opt.NUM_LAYER, opt.BIDIRECT,
                            opt.L_RNN_DROPOUT)
    rela_emb = Rela_Module(opt.NUM_HIDDEN * 3, opt.NUM_HIDDEN, opt.NUM_HIDDEN)
    classifier = SimpleClassifier(opt.NUM_HIDDEN * 2, opt.MID_DIM, 1,
                                  opt.FC_DROPOUT)
    ques_att = Q_Att(opt.NUM_HIDDEN, opt.MID_DIM, opt.FC_DROPOUT)
    # vlinear = FCNet([opt.NUM_HIDDEN, opt.MID_DIM, opt.NUM_HIDDEN])
    # rlinear = FCNet([opt.NUM_HIDDEN, opt.MID_DIM, opt.NUM_HIDDEN])

    return BaseModel(w_emb, q_emb, v_emb, a_emb, v_att, v_fc, rela_emb, r_att,
                     classifier, ques_att, opt)
Example #21
def build_model(dataset, num_hid, dropout, norm, activation, dropL, dropG,
                dropW, dropC):

    caption_w_emb = WordEmbedding(dataset.caption_dictionary.ntoken,
                                  emb_dim=300,
                                  dropout=dropW)
    caption_decoder_class = SimpleClassifier(
        in_dim=num_hid,
        hid_dim=2 * num_hid,
        out_dim=dataset.caption_dictionary.ntoken,
        dropout=dropC,
        norm=norm,
        act=activation)
    return CaptionDecoderRNN(300, num_hid, 2048, caption_w_emb,
                             caption_decoder_class)
Example #22
def build_baseline0_gcn(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding_all(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)

    q_net0 = FCNet([q_emb.num_hid, num_hid])
    v_net0 = FCNet([dataset.v_dim, num_hid])

    gcn = FCNet([num_hid, num_hid])

    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return GraphModel(w_emb, q_emb, v_att, q_net0, v_net0, gcn, q_net, v_net,
                      classifier)
Example #23
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    cls_att = NewAttention(dataset.cls_dim, q_emb.num_hid, num_hid)
    attr_att = NewAttention(dataset.attr_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    cls_net = FCNet([dataset.cls_dim, num_hid])
    attr_net = FCNet([dataset.attr_dim, num_hid])

    fusion_dim = 16000
    mcb = CompactBilinearPooling(num_hid, num_hid, fusion_dim)
    classifier = SimpleClassifier(fusion_dim, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    return BaseModel(w_emb, q_emb, v_att, cls_att, attr_att, q_net, v_net,
                     cls_net, attr_net, classifier, mcb)
Example #24
def build_model_A2x3(dataset, num_hid, dropout, norm, activation, dropL, dropG,
                     dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken,
                          emb_dim=300,
                          dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=False,
                              dropout=dropG,
                              rnn_type='GRU')

    v_att_1 = Att_2(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    v_att_2 = Att_2(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    v_att_3 = Att_2(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)
    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)

    classifier = SimpleClassifier(in_dim=num_hid,
                                  hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=dropC,
                                  norm=norm,
                                  act=activation)
    return Model_3(w_emb, q_emb, v_att_1, v_att_2, v_att_3, q_net, v_net,
                   classifier)
Example #25
def build_ban_foil(dataset, num_hid, num_ans_candidates, op='', gamma=4):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False,
                              .0)
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    b_net = []
    q_prj = []
    c_prj = []
    objects = 10  # minimum number of boxes
    for i in range(gamma):
        b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    classifier = SimpleClassifierFoil(num_hid, 64, num_ans_candidates)
    counter = Counter(objects)
    return BanModel(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                    classifier, counter, op, gamma)
Example #26
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    if not dataset.bert:
        q_att = SelfAttention(q_emb.num_hid, num_hid)
        v_att = NewAttention(dataset.v_dim + 2, q_emb.num_hid, num_hid)
        q_net = FCNet([q_emb.num_hid, num_hid])
    else:
        q_att = SelfAttention(768, num_hid)
        q_emb = FCNet([768, 768])
        v_att = NewAttention(dataset.v_dim, 768, num_hid)
        q_net = FCNet([768, num_hid])
    v_net = FCNet([dataset.v_dim + 2, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, q_att, v_att, q_net, v_net, classifier,
                     dataset.bert)
Example #27
def build_baseline0_newatt(dataset,
                           num_hid,
                           reconstruction,
                           size=64,
                           dropout_hid=0.0,
                           gamma_r=0.0,
                           adv_mode="wgan",
                           logger=None):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, num_hid,
                     dataset.v_dim, reconstruction, size, dropout_hid, gamma_r,
                     adv_mode, logger)
Example #28
def build_model_A3x2_h(dataset, num_hid, dropout, norm, activation, dropL,
                       dropG, dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken,
                          emb_dim=300,
                          dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=False,
                              dropout=dropG,
                              rnn_type='GRU')

    print('v_dim: %d\tq_dim: %d\tnum_hid: %d\t num ans candidates: %d' %
          (dataset.v_dim, q_emb.num_hid, num_hid, dataset.num_ans_candidates))
    v_att_1 = Att_3(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    v_att_2 = Att_3(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)
    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)
    h_net = HNet([1280, 100, 100], [1280, 1280])

    classifier = SimpleClassifier(in_dim=num_hid,
                                  hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=dropC,
                                  norm=norm,
                                  act=activation)
    return Model_h(w_emb, q_emb, v_att_1, v_att_2, q_net, v_net, h_net,
                   classifier)
Example #29
def build_baseline0_newatt2(args, num_hid):
    w_emb = WordEmbedding(args.vocab_size, args.ninp, 0.0)
    q_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)
    h_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)
    v_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)
    h_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)
    qih_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)
    qhi_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)
    q_net = FCNet([q_emb.num_hid*2, num_hid*2])
    v_net = FCNet([args.nhid*2, num_hid*2])
    h_net = FCNet([args.nhid*2, num_hid*2])
    qih_net = FCNet([args.nhid*2, num_hid*2])
    qhi_net = FCNet([args.nhid*2, num_hid*2])
    qhih_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)
    qihi_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)

    decoder = netG(args)
    return BaseModel2(w_emb, q_emb, h_emb, v_att, h_att, q_net, v_net, h_net,
                      qih_att, qhi_att, qih_net, qhi_net, decoder, args,
                      qhih_att, qihi_att)
Example #30
def build_SAN(dataset, args):
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, args.op)
    q_emb = QuestionEmbedding(300 if "c" not in args.op else 600, args.num_hid,
                              1, False, 0.0, args.rnn)
    v_att = StackedAttention(
        args.num_stacks,
        dataset.v_dim,
        args.num_hid,
        args.num_hid,
        dataset.num_ans_candidates,
        args.dropout,
    )
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + "/" + args.maml_model_path
        print("load initial weights MAML from: %s" % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + "/" + args.ae_model_path
        print("load initial weights DAE from: %s" % (weight_path))
        ae_v_emb.load_state_dict(torch.load(weight_path, args.map_location))
    # Loading tfidf weighted embedding
    if hasattr(args, "tfidf"):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # init classifier
    classifier = SimpleClassifier(args.num_hid, 2 * args.num_hid,
                                  dataset.num_ans_candidates, args)
    # construct the VQA model and return it
    if args.maml and args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb,
                         ae_v_emb)
    elif args.maml:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb,
                         None)
    elif args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, ae_v_emb)
    return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, None)
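build_SAN and build_BAN above both end in the same four-way branch over args.maml and args.autoencoder. Initializing the two optional embeddings to None before the flag checks collapses the branch to a single call; a fragment sketching that refactor inside build_SAN:

    # equivalent ending: declare `maml_v_emb = ae_v_emb = None` before the
    # `if args.maml:` / `if args.autoencoder:` blocks above, then
    return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb,
                     ae_v_emb)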