Example #1
def build_ban_foil(dataset, num_hid, num_ans_candidates, op='', gamma=4):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False,
                              .0)
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    b_net = []
    q_prj = []
    c_prj = []
    objects = 10  # minimum number of boxes
    for i in range(gamma):
        b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    classifier = SimpleClassifierFoil(num_hid, 64, num_ans_candidates)
    counter = Counter(objects)
    return BanModel(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                    classifier, counter, op, gamma)
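All of these examples build their projection layers with FCNet, whose definition is not included in any snippet. As a reference point, here is a minimal sketch consistent with how it is called above (a list of layer widths, an activation name where '' disables the nonlinearity, and a dropout rate), modeled on the BAN-style VQA codebases; the exact signature, and the norm= keyword some examples pass, are assumptions:

import torch.nn as nn
from torch.nn.utils import weight_norm

class FCNet(nn.Module):
    """Hypothetical reconstruction: an MLP of weight-normalized linear
    layers with optional activation and dropout between them."""
    def __init__(self, dims, act='ReLU', dropout=0.0):
        super(FCNet, self).__init__()
        layers = []
        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
            layers.append(weight_norm(nn.Linear(in_dim, out_dim), dim=None))
            if act:  # act='' or None skips the nonlinearity, as in FCNet([...], '')
                layers.append(getattr(nn, act)())  # e.g. nn.ReLU, nn.Sigmoid
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        return self.main(x)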
Example #2
    def __init__(self, vocab_size, embed_hidden=300, mlp_hidden=512):
        super(TopDown, self).__init__()

        self.vocab_size = vocab_size

        self.q_emb = nn.LSTM(embed_hidden,
                             mlp_hidden,
                             batch_first=True,
                             bidirectional=True)
        self.q_prep = FCNet([mlp_hidden, mlp_hidden])
        self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
        self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
        self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])
        self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                           self.vocab_size, 0.5)
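Shape note for this example: with bidirectional=True the LSTM returns 2 * mlp_hidden features per token, which is exactly why lstm_proj maps mlp_hidden * 2 back down to mlp_hidden. A standalone sketch with random inputs (sizes are illustrative):

import torch
import torch.nn as nn

embed_hidden, mlp_hidden = 300, 512
q_emb = nn.LSTM(embed_hidden, mlp_hidden, batch_first=True, bidirectional=True)
lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)

tokens = torch.randn(8, 12, embed_hidden)  # (batch, seq_len, embed_hidden)
out, _ = q_emb(tokens)                     # (8, 12, 2 * mlp_hidden)
q = lstm_proj(out)                         # (8, 12, mlp_hidden)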
Example #3
    def __init__(self, opt):
        super(UpDn, self).__init__()
        num_hid = opt.num_hid
        activation = opt.activation
        dropG = opt.dropG
        dropW = opt.dropW
        dropout = opt.dropout
        dropL = opt.dropL
        norm = opt.norm
        dropC = opt.dropC
        self.opt = opt
        print(f"ntokens {opt.ntokens}")
        self.w_emb = WordEmbedding(opt.ntokens, emb_dim=300, dropout=dropW)
        self.w_emb.init_embedding(f'{opt.data_dir}/glove6b_init_300d.npy')
        # self.q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
        #                                bidirect=False, dropout=dropG, rnn_type='GRU')
        self.q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid)

        self.q_net = FCNet([self.q_emb.num_hid, num_hid],
                           dropout=dropL,
                           norm=norm,
                           act=activation)
        self.gv_net = FCNet([2048, num_hid],
                            dropout=dropL,
                            norm=norm,
                            act=activation)

        self.gv_att_1 = Att_3(v_dim=2048,
                              q_dim=self.q_emb.num_hid,
                              num_hid=num_hid,
                              dropout=dropout,
                              norm=norm,
                              act=activation)
        self.gv_att_2 = Att_3(v_dim=2048,
                              q_dim=self.q_emb.num_hid,
                              num_hid=num_hid,
                              dropout=dropout,
                              norm=norm,
                              act=activation)
        self.classifier = SimpleClassifier(in_dim=num_hid,
                                           hid_dim=2 * num_hid,
                                           out_dim=3129,
                                           dropout=dropC,
                                           norm=norm,
                                           act=activation)
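The forward pass for gv_att_1 and gv_att_2 is not shown; a common way to use two such attention heads over the 2048-d region features is to sum the two maps and pool, which is what the paired construction suggests. A toy illustration with random tensors (the fusion rule is an assumption, not taken from this snippet):

import torch

batch, num_objs, v_dim = 4, 36, 2048
v = torch.randn(batch, num_objs, v_dim)                        # region features
att_1 = torch.softmax(torch.randn(batch, num_objs, 1), dim=1)  # glimpse 1
att_2 = torch.softmax(torch.randn(batch, num_objs, 1), dim=1)  # glimpse 2

att = att_1 + att_2            # fuse the two glimpses
v_emb = (att * v).sum(dim=1)   # (batch, v_dim) pooled visual embedding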
Example #4
def build_stackdualatt(dataset, num_hid, args):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, args.rnn_layer, False, 0.4)
    v_att1 = DualAttention(dataset.v_dim, q_emb.num_hid, num_hid, 0.2)
    v_att2 = DualAttention(dataset.v_dim, q_emb.num_hid + num_hid, num_hid,
                           0.2)
    v_att3 = DualAttention(dataset.v_dim, q_emb.num_hid + num_hid, num_hid,
                           0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    query_net = FCNet([dataset.v_dim, num_hid])

    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    model = BaseModelStackAtt(w_emb, q_emb, (v_att1, v_att2, v_att3), q_net,
                              v_net, query_net, classifier, args)
    return model
Example #5
def build_baseline0_newatt(dataset,
                           num_hid,
                           reconstruction,
                           size=64,
                           dropout_hid=0.0,
                           gamma_r=0.0,
                           adv_mode="wgan",
                           logger=None):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, num_hid,
                     dataset.v_dim, reconstruction, size, dropout_hid, gamma_r,
                     adv_mode, logger)
Example #6
def build_model_A3x2_h(dataset, num_hid, dropout, norm, activation, dropL,
                       dropG, dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken,
                          emb_dim=300,
                          dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=False,
                              dropout=dropG,
                              rnn_type='GRU')

    print('v_dim: %d\tq_dim: %d\tnum_hid: %d\t num ans candidates: %d' %
          (dataset.v_dim, q_emb.num_hid, num_hid, dataset.num_ans_candidates))
    v_att_1 = Att_3(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    v_att_2 = Att_3(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)
    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)
    h_net = HNet([1280, 100, 100], [1280, 1280])

    classifier = SimpleClassifier(in_dim=num_hid,
                                  hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=dropC,
                                  norm=norm,
                                  act=activation)
    return Model_h(w_emb, q_emb, v_att_1, v_att_2, q_net, v_net, h_net,
                   classifier)
Example #7
    def __init__(self, dim,
                 max_step=12, self_attention=False, memory_gate=False,
                 classes=28, dropout=0.15):
        super().__init__()

        self.q_trasform = FCNet([300, dim])
        self.v_trasform = FCNet([dim, dim])
        self.mac = MACUnit(dim, max_step,
                           self_attention, memory_gate, dropout)

        # self.classifier = nn.Sequential(linear(dim * 2, dim),
        #                                 nn.ELU(),
        #                                 nn.Dropout(0.5),
        #                                 linear(dim, classes))
        self.classifier = SimpleClassifier(dim, 2 * dim, classes, 0.5)

        self.max_step = max_step
        self.dim = dim
Example #8
def build_baseline0_newatt2(args, num_hid):
    w_emb = WordEmbedding(args.vocab_size, args.ninp, 0.0)
    q_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)
    h_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)
    v_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)
    h_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)
    qih_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)
    qhi_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)
    q_net = FCNet([q_emb.num_hid*2, num_hid*2])
    v_net = FCNet([args.nhid*2, num_hid*2])
    h_net = FCNet([args.nhid*2, num_hid*2])
    qih_net = FCNet([args.nhid*2, num_hid*2])
    qhi_net = FCNet([args.nhid*2, num_hid*2])
    qhih_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)
    qihi_att = NewAttention(args.nhid*2, q_emb.num_hid*2, num_hid*2)

    decoder = netG(args)
    return BaseModel2(w_emb, q_emb, h_emb, v_att, h_att, q_net, v_net, h_net,
                      qih_att, qhi_att, qih_net, qhi_net, decoder, args,
                      qhih_att, qihi_att)
Example #9
def build_ban(dataset, num_hid, op='', gamma=4, q_emb_type='bert', on_do_q=False, finetune_q=False):
    if 'bert' in q_emb_type:
        q_emb = BertModel.from_pretrained('bert-base-multilingual-cased')
        q_dim = 768
    elif 'rg' in q_emb_type:
        w_dim = 100
        q_dim = num_hid
        q_emb = RnnQuestionEmbedding(dataset.dictionary.ntoken, w_dim, q_dim, op)
    elif 'pkb' in q_emb_type:
        w_dim = 200
        q_dim = num_hid
        q_emb = RnnQuestionEmbedding(dataset.dictionary.ntoken, w_dim, q_dim, op)

    if 'bertrnn' in q_emb_type:
        q_emb = BertRnnQuestionEmbedding(q_emb, 200, num_hid, op)
        q_dim = num_hid

    if not finetune_q: # Freeze question embedding
        if isinstance(q_emb, BertModel):
            for p in q_emb.parameters():
                p.requires_grad_(False)
        else:
            for p in q_emb.w_emb.parameters():
                p.requires_grad_(False)
    if not on_do_q: # Remove dropout of question embedding
        for m in q_emb.modules():
            if isinstance(m, nn.Dropout):
                m.p = 0.

    v_att = BiAttention(dataset.v_dim, q_dim, num_hid, gamma)
    b_net = []
    q_prj = []
    c_prj = []
    objects = 10  # minimum number of boxes
    for i in range(gamma):
        b_net.append(BCNet(dataset.v_dim, q_dim, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, q_dim], '', .2))
        c_prj.append(FCNet([objects + 1, q_dim], 'ReLU', .0))
    classifiers = [SimpleClassifier(q_dim, num_hid * 2, dataset.num_ans_candidates, .5),
                   SimpleClassifier(q_dim, num_hid * 2, 1, .5)]
    counter = Counter(objects)
    return BanModel(dataset, q_emb, v_att, b_net, q_prj, c_prj, classifiers, counter, op, gamma)
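The freezing and dropout-removal idioms above are plain PyTorch and work on any module tree; isolated, with a toy stand-in for the question encoder:

import torch.nn as nn

q_emb = nn.Sequential(nn.Linear(300, 512), nn.Dropout(0.3), nn.Linear(512, 512))

for p in q_emb.parameters():   # freeze: no gradients for these weights
    p.requires_grad_(False)

for m in q_emb.modules():      # disable dropout in place, as build_ban does
    if isinstance(m, nn.Dropout):
        m.p = 0.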
Example #10
def build_baseline2(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = nn.Linear(dataset.v_dim, 300)
    v_bn = nn.BatchNorm1d(300, momentum=0.01)
    lstm = nn.LSTM(300, num_hid, 1, batch_first=True)
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, lstm, v_bn)
Example #11
    def __init__(self, embed_hidden=300, mlp_hidden=512):
        super(TopDown, self).__init__()

        self.q_emb = nn.LSTM(embed_hidden,
                             mlp_hidden,
                             batch_first=True,
                             bidirectional=True)
        self.q_prep = FCNet([mlp_hidden, mlp_hidden])
        self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
        self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
        self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
Example #12
    def __init__(
            self,
            encoder,
            gpu_mode,
            embed_hidden=300,
            mlp_hidden=512
    ):
        super(E2ENetwork, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.n_roles = self.encoder.get_num_roles()
        self.n_verbs = self.encoder.get_num_verbs()
        self.vocab_size = self.encoder.get_num_labels()
        self.max_role_count = self.encoder.get_max_role_count()

        self.conv = vgg16_modified()

        self.verb = nn.Sequential(
            linear(mlp_hidden*8, mlp_hidden*2),
            nn.BatchNorm1d(mlp_hidden*2),
            nn.ReLU(),
            nn.Dropout(0.5),
            linear(mlp_hidden*2, self.n_verbs),
        )

        self.frcnn_lower = FCNet([mlp_hidden*4, mlp_hidden])
        # TODO: init embedding
        self.role_lookup = nn.Embedding(self.n_roles+1, embed_hidden, padding_idx=self.n_roles)
        self.verb_lookup = nn.Embedding(self.n_verbs, embed_hidden)

        self.role_labeller = MACNetwork(mlp_hidden, max_step=4, self_attention=False, memory_gate=False,
                                        classes=self.vocab_size)

        self.conv_hidden = self.conv.base_size()
        self.mlp_hidden = mlp_hidden
        self.embed_hidden = embed_hidden
Example #13
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 3)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        self.rep = FCNet([d_model, d_model])
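clones is not defined in this snippet; in the Annotated Transformer, from which this class is adapted, it is a deep-copy helper along these lines:

import copy
import torch.nn as nn

def clones(module, n):
    """Produce n independent copies of a module in a ModuleList."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])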
Example #14
    def __init__(self, v_dim, q_dim, h_dim, h_out, act='ReLU', dropout=[.2, .5], k=3):
        super(BCNet, self).__init__()

        self.c = 32
        self.k = k
        self.v_dim = v_dim
        self.q_dim = q_dim
        self.h_dim = h_dim
        self.h_out = h_out

        self.v_net = FCNet([v_dim, h_dim * self.k], act=act, dropout=dropout[0])
        self.q_net = FCNet([q_dim, h_dim * self.k], act=act, dropout=dropout[0])
        self.dropout = nn.Dropout(dropout[1])  # attention dropout
        if k > 1:
            self.p_net = nn.AvgPool1d(self.k, stride=self.k)

        if h_out is None:
            pass
        elif h_out <= self.c:
            self.h_mat = nn.Parameter(
                torch.Tensor(1, h_out, 1, h_dim * self.k).normal_())
            self.h_bias = nn.Parameter(torch.Tensor(1, h_out, 1, 1).normal_())
        else:
            self.h_net = weight_norm(nn.Linear(h_dim * self.k, h_out), dim=None)
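When k > 1, BCNet projects into an h_dim * k space and p_net average-pools each group of k channels back down to h_dim, the sum-pooling trick from multimodal factorized bilinear pooling; the forward pass (not shown) typically rescales the average by k to recover a sum. In isolation:

import torch
import torch.nn as nn

h_dim, k = 4, 3
p_net = nn.AvgPool1d(k, stride=k)

x = torch.randn(2, 1, h_dim * k)  # (batch, 1, h_dim * k)
pooled = p_net(x) * k             # (batch, 1, h_dim): average * k == sum per group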
Example #15
    def __init__(self,
                 v_dim,
                 q_dim,
                 num_hid,
                 norm,
                 act,
                 bidirect=False,
                 dropout=0.0):
        super(Base_Att, self).__init__()
        norm_layer = get_norm(norm)
        if not bidirect:
            self.nonlinear = FCNet([v_dim + q_dim, num_hid],
                                   dropout=dropout,
                                   norm=norm,
                                   act=act)
        else:
            self.nonlinear = FCNet([v_dim + 2 * q_dim, num_hid],
                                   dropout=dropout,
                                   norm=norm,
                                   act=act)
        self.linear = norm_layer(nn.Linear(num_hid, 1), dim=None)
Example #16
def build_model_APx2(dataset, num_hid, dropout, norm, activation, dropL, dropG,
                     dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken,
                          emb_dim=300,
                          dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=False,
                              dropout=dropG,
                              rnn_type='GRU')

    v_att_1 = Att_P(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    v_att_2 = Att_P(v_dim=dataset.v_dim,
                    q_dim=q_emb.num_hid,
                    num_hid=num_hid,
                    dropout=dropout,
                    norm=norm,
                    act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)
    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)

    classifier = SimpleClassifier(in_dim=num_hid,
                                  hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=dropC,
                                  norm=norm,
                                  act=activation)
    return Model_2(w_emb, q_emb, v_att_1, v_att_2, q_net, v_net, classifier)
Example #17
def build_fine(dataset, num_hid, args):
    cnn = getattr(resnet, args.cnn_model)()
    cnn.load_state_dict(
        torch.load(os.path.join(args.model_root, args.cnn_model + '.pth')))
    my_cnn = myResnet(cnn)

    for param in my_cnn.parameters():
        param.requires_grad = False
    for param in my_cnn.resnet.layer4.parameters():
        param.requires_grad = True

    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.4)
    v_att = DualAttention(dataset.v_dim, q_emb.num_hid, num_hid, 0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    model = BaseModelWithCNN(w_emb, q_emb, v_att, q_net, v_net, classifier,
                             my_cnn, args)
    return model
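Since build_fine freezes everything except layer4 of the ResNet, the training script should hand the optimizer only the trainable parameters. A self-contained sketch of that pattern (the optimizer choice is illustrative):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 10), nn.Linear(10, 2))
for p in model[0].parameters():   # stand-in for the frozen CNN layers
    p.requires_grad = False

trainable = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adamax(trainable, lr=2e-3)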
Example #18
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False,
                              .0)
    v_att = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    q_net = FCNet([q_emb.num_hid, num_hid], 'Sigmoid')
    v_net = FCNet([dataset.v_dim, num_hid])
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2,
                                      dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                        q_net, v_net, classifier, counter, op, gamma)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
Example #19
    def __init__(self,
                 v_dim,
                 q_dim,
                 a_dim,
                 h_dim,
                 h_out,
                 rank,
                 glimpse,
                 act='ReLU',
                 dropout=[.2, .5],
                 k=1):
        super(TCNet, self).__init__()

        self.v_dim = v_dim
        self.q_dim = q_dim
        self.a_dim = a_dim
        self.h_out = h_out
        self.rank = rank
        self.h_dim = h_dim * k
        self.hv_dim = int(h_dim / rank)
        self.hq_dim = int(h_dim / rank)
        self.ha_dim = int(h_dim / rank)

        self.v_tucker = FCNet([v_dim, self.h_dim], act=act, dropout=dropout[1])
        self.q_tucker = FCNet([q_dim, self.h_dim], act=act, dropout=dropout[0])
        self.a_tucker = FCNet([a_dim, self.h_dim], act=act, dropout=dropout[0])

        if self.h_dim < 1024:
            self.a_tucker = FCNet([a_dim, self.h_dim],
                                  act=act,
                                  dropout=dropout[0])
            self.v_net = nn.ModuleList([
                FCNet([self.h_dim, self.hv_dim], act=act, dropout=dropout[1])
                for _ in range(rank)
            ])
            self.q_net = nn.ModuleList([
                FCNet([self.h_dim, self.hq_dim], act=act, dropout=dropout[0])
                for _ in range(rank)
            ])
            self.a_net = nn.ModuleList([
                FCNet([self.h_dim, self.ha_dim], act=act, dropout=dropout[0])
                for _ in range(rank)
            ])

            if h_out > 1:
                self.ho_dim = int(h_out / rank)
                h_out = self.ho_dim

            self.T_g = nn.Parameter(
                torch.Tensor(1, rank, self.hv_dim, self.hq_dim, self.ha_dim,
                             glimpse, h_out).normal_())
        self.dropout = nn.Dropout(dropout[1])
Example #20
def build_baseline0_newatt(dataset,
                           num_hid,
                           reconstruction,
                           layer=4,
                           size=64,
                           variant='',
                           finetune=False,
                           use_residual=False,
                           use_feat_loss=False,
                           dropout_hid=False,
                           dropout_unet=False,
                           logger=None):

    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, num_hid,
                     dataset.v_dim, reconstruction, layer, size, variant,
                     finetune, use_residual, use_feat_loss, dropout_hid,
                     dropout_unet, logger)
Example #21
    def __init__(self,
                 encoder,
                 qword_embeddings,
                 label_embedding,
                 vocab_size,
                 gpu_mode,
                 mlp_hidden=512,
                 embed_hidden=300):
        super(ImSituationHandler, self).__init__()

        self.encoder = encoder
        self.qword_embeddings = qword_embeddings
        self.label_embedding = label_embedding
        self.vocab_size = vocab_size
        self.gpu_mode = gpu_mode
        self.img_q_handler = RoleQHandler()
        self.img_label_handler = LabelHandler(self.label_embedding)
        self.q_label_handler = Role2LabelHandler(self.label_embedding)
        self.c_net = FCNet([mlp_hidden * 3, mlp_hidden])
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.a_net = FCNet([embed_hidden, mlp_hidden])
        self.sim_scorer = SimpleClassifier(mlp_hidden, mlp_hidden, 1, 0.5)

        self.mlp_hidden = mlp_hidden
Example #22
    def __init__(self, dim, self_attention=False, memory_gate=False):
        super().__init__()

        self.concat = FCNet([dim*2, dim])
        #self.rnn = nn.GRUCell(dim, dim)

        if self_attention:
            self.attn = linear(dim, 1)
            self.mem = linear(dim, dim)

        if memory_gate:
            self.control = linear(dim, 1)

        self.self_attention = self_attention
        self.memory_gate = memory_gate
        self.dim = dim
Example #23
    def __init__(self, v_dim, q_dim, num_hid, dropout=0.2):
        super(DualAttention, self).__init__()

        self.v_proj1 = FCNet([v_dim, num_hid])
        self.v_proj2 = FCNet([v_dim, num_hid])
        self.q_proj1 = FCNet([q_dim, num_hid])
        self.q_proj2 = FCNet([q_dim, num_hid])
        self.dropout = nn.Dropout(dropout)
        self.on_repr1 = FCNet([num_hid, num_hid])
        self.on_repr2 = FCNet([num_hid, num_hid])
        self.linear1 = weight_norm(nn.Linear(num_hid, 1), dim=None)
        self.linear2 = weight_norm(nn.Linear(num_hid, 1), dim=None)
Example #24
    def __init__(self, v_dim, q_dim, num_hid):
        super(StackAttention1, self).__init__()
        # in this case we have q_dim = v_dim = num_hidden = 1024
        self.input_size = v_dim
        self.fc_q1 = FCNet([q_dim, 768])
        self.fc_q2 = FCNet([768, 640])
        self.fc_v1 = FCNet([v_dim, 768])
        self.fc_v2 = FCNet([768, 640])
        self.att_size = 512
        self.linear1 = FCNet([640, self.att_size])
        self.fc_vq1 = FCNet([self.att_size, 1])
        self.tan = nn.Tanh()
        self.dp = nn.Dropout(0.5)
Example #25
    def __init__(self,
                 c_dim,
                 num_hid,
                 q_dim,
                 nlayers,
                 bidirect,
                 dropout,
                 rnn_type='LSTM',
                 v_dim=2048):
        """Module for question embedding
        """
        super(CaptionQuestionImageRNN0, self).__init__()
        assert rnn_type == 'LSTM' or rnn_type == 'GRU'
        rnn_cls = nn.LSTM if rnn_type == 'LSTM' else nn.GRU
        norm_layer = get_norm('weight')
        self.rnn_att = rnn_cls(c_dim,
                               num_hid,
                               nlayers,
                               bidirectional=bidirect,
                               dropout=dropout,
                               batch_first=True)

        self.rnn_c = rnn_cls(c_dim,
                             num_hid,
                             nlayers,
                             bidirectional=bidirect,
                             dropout=dropout,
                             batch_first=True)

        self.v_emb_for_c = FCNet([v_dim, num_hid],
                                 dropout=0.2,
                                 norm='weight',
                                 act='LeakyReLU')
        self.v_att_logits = norm_layer(nn.Linear(num_hid, 1), dim=None)

        self.Sig = nn.Sigmoid()
        self.c_dim = c_dim
        self.q_dim = q_dim
        self.num_hid = num_hid
        self.nlayers = nlayers
        self.ndirections = int(bidirect) + 1
        self.rnn_type = rnn_type
        self.v_dim = v_dim
Example #26
    def __init__(self,
                 encoder,
                 gpu_mode,
                 conv_hidden=24,
                 embed_hidden=300,
                 lstm_hidden=300,
                 mlp_hidden=512):
        super(BaseModel, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.vocab_size = self.encoder.get_num_labels()

        self.conv = vgg16_modified()

        self.agent = nn.Sequential(
            nn.Linear(mlp_hidden * 8, mlp_hidden * 2),
            nn.BatchNorm1d(mlp_hidden * 2),
            nn.ReLU(),
            nn.Dropout(0.5),
        )

        self.oh_to_comp = FCNet(
            [385, mlp_hidden, mlp_hidden, mlp_hidden, mlp_hidden, mlp_hidden])
        self.final_layer = nn.Linear(mlp_hidden * 3, self.vocab_size)
Example #27
    def __init__(self, hidden_dim=1024, dropout_=0.2, **kwargs):
        super(AttentionTextEmbedding, self).__init__()

        self.text_out_dim = hidden_dim * kwargs["conv2_out"]

        self.dropout = nn.Dropout(p=dropout_)

        conv1_out = kwargs["conv1_out"]
        conv2_out = kwargs["conv2_out"]

        self.transform = FCNet([hidden_dim, conv1_out], dropout=0.2)

        layers = [
            nn.Dropout(0.2, inplace=False),
            weight_norm(nn.Linear(conv1_out, conv2_out), dim=None)
        ]

        # self.proj = nn.Linear(conv2_out * hidden_dim, self.text_out_dim)
        self.atte = nn.Sequential(*layers)
Example #28
def build_vqae3_split(dataset, num_hid, att_dim, dec_dim):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att_1 = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net_1 = FCNet([q_emb.num_hid, num_hid])
    v_net_1 = FCNet([dataset.v_dim, num_hid])
    v_att_2 = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net_2 = FCNet([q_emb.num_hid, num_hid])
    v_net_2 = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    generator = STDecoder(
        dataset.v_dim, num_hid, 300, dec_dim,
        dataset.explanation_dictionary.ntoken, 1, 0.5)
    e_emb = ExplainEmbedding(generator.embed, 300, num_hid, 1, False, 0.0,
                             'GRU')
    T_vq = FCNet([num_hid, num_hid])
    T_e = FCNet([e_emb.num_hid, num_hid])
    return Split_VQAE(w_emb, q_emb, v_att_1, q_net_1, v_net_1, v_att_2,
                      q_net_2, v_net_2, classifier, generator, e_emb, T_vq,
                      T_e)
Example #29
    def __init__(self, v_dim, q_dim, num_hid):
        super(Attention, self).__init__()
        self.nonlinear = FCNet([v_dim + q_dim, num_hid])
        self.linear = weight_norm(nn.Linear(num_hid, 1), dim=None)
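This two-layer module implies the classic concatenation attention: tile the question vector over the image regions, concatenate, score each region, softmax. A minimal sketch of that forward pass (not shown in the snippet), with plain linear layers substituted for FCNet:

import torch
import torch.nn as nn

v_dim, q_dim, num_hid = 2048, 1024, 1024
nonlinear = nn.Sequential(nn.Linear(v_dim + q_dim, num_hid), nn.ReLU())
linear = nn.Linear(num_hid, 1)

v = torch.randn(4, 36, v_dim)                # (batch, num_objs, v_dim)
q = torch.randn(4, q_dim)                    # (batch, q_dim)
q_tiled = q.unsqueeze(1).expand(-1, 36, -1)  # repeat q for every region
logits = linear(nonlinear(torch.cat([v, q_tiled], dim=2)))
att = torch.softmax(logits, dim=1)           # (batch, num_objs, 1)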
Example #30
    def __init__(self,
                 encoder,
                 gpu_mode,
                 conv_hidden=24,
                 embed_hidden=300,
                 lstm_hidden=300,
                 mlp_hidden=512):
        super(BaseModel, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        #self.vocab_size = self.encoder.get_num_labels()
        self.n_verbs = self.encoder.get_num_verbs()
        self.vocab_size = self.encoder.get_num_labels()

        self.q_word_count = len(self.encoder.question_words)

        self.conv_agent = vgg16_modified()
        self.conv_verb = vgg16_modified()

        self.agent = nn.Sequential(nn.Linear(mlp_hidden * 8, mlp_hidden * 2),
                                   nn.BatchNorm1d(mlp_hidden * 2), nn.ReLU(),
                                   nn.Dropout(0.5),
                                   nn.Linear(mlp_hidden * 2, self.vocab_size))

        self.proj_obj = nn.Sequential(nn.Linear(mlp_hidden * 2, mlp_hidden))

        self.whole_img = nn.Sequential(
            nn.Linear(mlp_hidden * 8, mlp_hidden * 2),
            nn.BatchNorm1d(mlp_hidden * 2), nn.ReLU(),
            nn.Linear(mlp_hidden * 2, mlp_hidden), nn.Tanh())

        self.agent4img = nn.Sequential(nn.Linear(mlp_hidden * 2, mlp_hidden),
                                       nn.Tanh())

        self.verb = TopDown()
        self.w_emb = nn.Embedding(self.q_word_count, embed_hidden)
        self.agent_label_lookup = nn.Embedding(self.vocab_size, embed_hidden)
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])

        self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                           self.n_verbs, 0.5)
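The train/dev transforms used throughout these image-grounded examples are standard torchvision pipelines producing normalized (3, 224, 224) tensors. A standalone usage sketch with a blank stand-in image:

import torchvision as tv
from PIL import Image

normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225])
dev_transform = tv.transforms.Compose([
    tv.transforms.Resize(224),
    tv.transforms.CenterCrop(224),
    tv.transforms.ToTensor(),
    normalize,
])

img = Image.new('RGB', (640, 480))   # stand-in for a real photo
x = dev_transform(img).unsqueeze(0)  # (1, 3, 224, 224), ready for the CNN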