Example #1
def build_baseline(embeddings, num_ans_candidates):
    vision_features = config.output_features
    visual_glimpses = config.visual_glimpses
    question_features = hidden_features = config.hid_dim
    w_emb = WordEmbedding(embeddings, dropout=0.0)

    q_emb = QuestionEmbedding(w_dim=300,
                              hid_dim=question_features,
                              nlayers=1,
                              bidirect=False,
                              dropout=0.0)

    v_att = Attention(
        v_dim=vision_features,
        q_dim=question_features * q_emb.ndirections,
        hid_dim=hidden_features,
        glimpses=visual_glimpses,
    )

    classifier = SimpleClassifier(
        in_dim=(question_features * q_emb.ndirections, vision_features),
        hid_dim=(hidden_features, hidden_features * 2),
        out_dim=num_ans_candidates,
        dropout=0.5)
    return BaseModel(w_emb, q_emb, v_att, classifier)
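
A minimal usage sketch for the builder above; WordEmbedding, config and friends come from the surrounding project, and the embedding matrix and answer count below are placeholders rather than real values:

import torch

# Hypothetical inputs: a pre-trained (vocab_size x 300) word-embedding matrix and
# the size of the answer vocabulary, both normally produced by dataset preprocessing.
embeddings = torch.randn(20000, 300)
num_ans_candidates = 3129

model = build_baseline(embeddings, num_ans_candidates)
print(model)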
Example #2
def baseline(args, dataset, pretrained=False):

    # initialise model
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, args.num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, args.num_hid)
    q_net = FCNet([args.num_hid, args.num_hid])
    v_net = FCNet([dataset.v_dim, args.num_hid])
    classifier = SimpleClassifier(args.num_hid, 2 * args.num_hid,
                                  dataset.num_ans_candidates, 0.5)
    model = BaseModel(args, w_emb, q_emb, v_att, q_net, v_net, classifier)

    # map checkpoint tensors to CPU when CUDA is not available
    map_location = None
    if not model.cuda_available:
        map_location = torch.device('cpu')

    # download and load pretrained model
    if pretrained:
        key = 'baseline-vqa'
        url = pretrained_urls[key]
        state_dict = download_model(key, url, map_location=map_location)['model']
        model.load_state_dict(state_dict, strict=False)
    else:
        key = 'untrained'

    # set model name
    model.name = key

    return model
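
A rough call sketch, assuming args carries num_hid and dataset exposes the attributes read above (dictionary.ntoken, v_dim, num_ans_candidates); the dataset loader below is a hypothetical placeholder:

from argparse import Namespace

args = Namespace(num_hid=1024)                       # hidden size used throughout the model
dataset = load_vqa_dataset()                         # hypothetical helper returning a VQA dataset object
model = baseline(args, dataset, pretrained=True)     # fetches the 'baseline-vqa' checkpoint
print(model.name)                                    # 'baseline-vqa'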
Example #3
def build_regat(dataset, args):
    print("Building ReGAT model with %s relation and %s fusion method" %
          (args.relation_type, args.fusion))
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid,
                              1, False, .0)
    q_att = QuestionSelfAttention(args.num_hid, .2)

    if args.relation_type == "semantic":
        v_relation = ExplicitRelationEncoder(
            dataset.v_dim,
            args.num_hid,
            args.relation_dim,
            args.dir_num,
            args.sem_label_num,
            num_heads=args.num_heads,
            num_steps=args.num_steps,
            nongt_dim=args.nongt_dim,
            residual_connection=args.residual_connection,
            label_bias=args.label_bias)
    elif args.relation_type == "spatial":
        v_relation = ExplicitRelationEncoder(
            dataset.v_dim,
            args.num_hid,
            args.relation_dim,
            args.dir_num,
            args.spa_label_num,
            num_heads=args.num_heads,
            num_steps=args.num_steps,
            nongt_dim=args.nongt_dim,
            residual_connection=args.residual_connection,
            label_bias=args.label_bias)
    else:
        v_relation = ImplicitRelationEncoder(
            dataset.v_dim,
            args.num_hid,
            args.relation_dim,
            args.dir_num,
            args.imp_pos_emb_dim,
            args.nongt_dim,
            num_heads=args.num_heads,
            num_steps=args.num_steps,
            residual_connection=args.residual_connection,
            label_bias=args.label_bias)

    classifier = SimpleClassifier(args.num_hid, args.num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    gamma = 0
    if args.fusion == "ban":
        joint_embedding = BAN(args.relation_dim, args.num_hid, args.ban_gamma)
        gamma = args.ban_gamma
    elif args.fusion == "butd":
        joint_embedding = BUTD(args.relation_dim, args.num_hid, args.num_hid)
    else:
        joint_embedding = MuTAN(args.relation_dim, args.num_hid,
                                dataset.num_ans_candidates, args.mutan_gamma)
        gamma = args.mutan_gamma
        classifier = None
    return ReGAT(dataset, w_emb, q_emb, q_att, v_relation, joint_embedding,
                 classifier, gamma, args.fusion, args.relation_type)
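
build_regat only reads a fixed set of attributes from args; a sketch of a matching namespace (the values are illustrative, not necessarily the ReGAT defaults):

from argparse import Namespace

args = Namespace(
    relation_type="implicit",   # "semantic", "spatial", or anything else for implicit relations
    fusion="butd",              # "ban", "butd", or anything else for MuTAN
    op="c",                     # word-embedding op; a 'c' doubles the question input dim to 600
    num_hid=1024, relation_dim=1024, dir_num=2,
    sem_label_num=15, spa_label_num=11,
    imp_pos_emb_dim=64, nongt_dim=20,
    num_heads=16, num_steps=1,
    residual_connection=True, label_bias=True,
    ban_gamma=4, mutan_gamma=2,
)
model = build_regat(dataset, args)  # dataset must provide v_dim, dictionary and num_ans_candidates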
Example #4
def build_baseline_with_onestep(embeddings,
                                num_ans_candidates,
                                debias_mode='LearnedMixin'):
    assert debias_mode in [
        'BiasProduct', 'ReweightByInvBias', 'LearnedMixin', 'Plain'
    ]
    vision_features = config.output_features
    visual_glimpses = config.visual_glimpses
    hidden_features = config.hid_dim
    question_features = config.hid_dim
    w_emb = WordEmbedding(embeddings, dropout=0.0)
    q_emb = QuestionEmbedding(w_dim=300,
                              hid_dim=question_features,
                              nlayers=1,
                              bidirect=False,
                              dropout=0.0)

    v_att = attention.Attention(
        v_dim=vision_features,
        q_dim=question_features,
        hid_dim=hidden_features,
        glimpses=visual_glimpses,
    )

    classifier = SimpleClassifier(in_dim=(question_features,
                                          visual_glimpses * vision_features),
                                  hid_dim=(hidden_features,
                                           hidden_features * 2),
                                  out_dim=num_ans_candidates,
                                  dropout=0.5)

    # mask_v_att = attention.Attention(
    #     v_dim=vision_features,
    #     q_dim=question_features,
    #     hid_dim=hidden_features,
    #     glimpses=visual_glimpses,
    # )
    #
    # mask_classifier = SimpleClassifier(
    #     in_dim=(question_features, vision_features),
    #     hid_dim=(hidden_features, hidden_features * 2),
    #     out_dim=num_ans_candidates,
    #     dropout=0.5
    # )
    # Pick the debiasing loss function based on the debias_mode argument
    debias_loss_fn = eval(debias_mode)()
    return BaseModel_with_Onestep(w_emb, q_emb, v_att, classifier,
                                  debias_loss_fn)
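
The eval(debias_mode)() call resolves the loss class from its name at runtime. A more explicit alternative is a plain name-to-class mapping; a sketch, assuming the four loss classes can be imported from the project's debias-loss module (the import path below is a guess):

# Assumed import path; adjust to wherever the debiasing losses live in the project.
from vqa_debias_loss_functions import BiasProduct, ReweightByInvBias, LearnedMixin, Plain

DEBIAS_LOSSES = {
    'BiasProduct': BiasProduct,
    'ReweightByInvBias': ReweightByInvBias,
    'LearnedMixin': LearnedMixin,
    'Plain': Plain,
}
debias_loss_fn = DEBIAS_LOSSES[debias_mode]()   # raises KeyError for unknown modes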
Example #5
def build_baseline_with_twostep(embeddings,
                                num_ans_candidates,
                                debias_mode='LearnedMixin'):
    assert debias_mode in [
        'BiasProduct', 'ReweightByInvBias', 'LearnedMixin', 'Plain'
    ]
    vision_features = config.output_features
    visual_glimpses = config.visual_glimpses
    hidden_features = config.hid_dim
    question_features = config.hid_dim
    w_emb = WordEmbedding(embeddings, dropout=0.0)
    q_emb = QuestionEmbedding(w_dim=300,
                              hid_dim=question_features,
                              nlayers=1,
                              bidirect=False,
                              dropout=0.0)

    v_att = attention.Attention(
        v_dim=vision_features,
        q_dim=question_features,
        hid_dim=hidden_features,
        glimpses=visual_glimpses,
    )

    classifier = SimpleClassifier(in_dim=(question_features,
                                          visual_glimpses * vision_features),
                                  hid_dim=(hidden_features,
                                           hidden_features * 2),
                                  out_dim=num_ans_candidates,
                                  dropout=0.5)

    if config.use_rubi:
        c1 = MLP(
            input_dim=question_features,
            dimensions=[1024, 1024, num_ans_candidates],
        )
        c2 = nn.Linear(num_ans_candidates, num_ans_candidates)
    else:
        c1, c2 = None, None

    # Pick the debiasing loss function based on the debias_mode argument
    fusion_dim = hidden_features if config.fusion_type == 'mul' else hidden_features * 2
    debias_loss_fn = eval(debias_mode)(fusion_dim)
    return BaseModel_with_Twostep(w_emb, q_emb, v_att, classifier,
                                  debias_loss_fn, c1, c2)
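
Both the one-step and two-step builders read everything else from a module-level config; a sketch of the fields they touch (values are placeholders, e.g. 2048-d bottom-up region features):

from types import SimpleNamespace

# Hypothetical stand-in for the project's config module.
config = SimpleNamespace(
    output_features=2048,   # dimensionality of the visual region features
    visual_glimpses=2,      # number of attention glimpses
    hid_dim=1024,           # shared hidden size for question/fusion layers
    use_rubi=False,         # enable the question-only branch (c1/c2) above
    fusion_type='mul',      # 'mul' keeps hid_dim; anything else doubles the loss input dim
)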
Example #6
def build_regat(dataset, args):
    print("Building ReGAT model with %s relation and %s fusion method" %
          (args.relation_type, args.fusion))
    # word-embedding module
    # WordEmbedding(
    #   (emb): Embedding(19902, 300, padding_idx=19901)
    #   (emb_): Embedding(19902, 300, padding_idx=19901)
    #   (dropout): Dropout(p=0.0, inplace=False)
    # )
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.op)
    # question embedding
    # QuestionEmbedding(
    #   (rnn): GRU(600, 1024, batch_first=True)
    # )
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid,
                              1, False, .0)
    # question self-attention
    # QuestionSelfAttention(
    #   (drop): Dropout(p=0.2, inplace=False)
    #   (W1_self_att_q): FCNet(
    #     (main): Sequential(
    #       (0): Dropout(p=0.2, inplace=False)
    #       (1): Linear(in_features=1024, out_features=1024, bias=True)
    #     )
    #   )
    #   (W2_self_att_q): FCNet(
    #     (main): Sequential(
    #       (0): Linear(in_features=1024, out_features=1, bias=True)
    #     )
    #   )
    # )
    q_att = QuestionSelfAttention(args.num_hid, .2)

    if args.relation_type == "semantic":  #如果关系类型是语义的
        v_relation = ExplicitRelationEncoder(
            dataset.v_dim,
            args.num_hid,
            args.relation_dim,
            args.dir_num,
            args.sem_label_num,
            num_heads=args.num_heads,
            num_steps=args.num_steps,
            nongt_dim=args.nongt_dim,
            residual_connection=args.residual_connection,
            label_bias=args.label_bias)
    elif args.relation_type == "spatial":  #如果关系类型是空间的
        v_relation = ExplicitRelationEncoder(
            dataset.v_dim,
            args.num_hid,
            args.relation_dim,
            args.dir_num,
            args.spa_label_num,
            num_heads=args.num_heads,
            num_steps=args.num_steps,
            nongt_dim=args.nongt_dim,
            residual_connection=args.residual_connection,
            label_bias=args.label_bias)
    else:  # otherwise, implicit relations
        v_relation = ImplicitRelationEncoder(
            dataset.v_dim,
            args.num_hid,
            args.relation_dim,
            args.dir_num,
            args.imp_pos_emb_dim,
            args.nongt_dim,
            num_heads=args.num_heads,
            num_steps=args.num_steps,
            residual_connection=args.residual_connection,
            label_bias=args.label_bias)
    # classifier
    classifier = SimpleClassifier(args.num_hid, args.num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    gamma = 0
    # choose the fusion method
    if args.fusion == "ban":
        joint_embedding = BAN(args.relation_dim, args.num_hid, args.ban_gamma)
        gamma = args.ban_gamma
    elif args.fusion == "butd":
        joint_embedding = BUTD(args.relation_dim, args.num_hid, args.num_hid)
    else:
        joint_embedding = MuTAN(args.relation_dim, args.num_hid,
                                dataset.num_ans_candidates, args.mutan_gamma)
        gamma = args.mutan_gamma
        classifier = None
    return ReGAT(dataset, w_emb, q_emb, q_att, v_relation, joint_embedding,
                 classifier, gamma, args.fusion, args.relation_type)