def __init__(self,
                 encoder,
                 qword_embeddings,
                 verb_lookup,
                 n_roles,
                 vocab_size,
                 gpu_mode,
                 mlp_hidden=512,
                 embd_hidden=300):
        super(ImSituationHandler, self).__init__()

        self.encoder = encoder
        self.qword_embeddings = qword_embeddings
        self.verb_lookup = verb_lookup
        self.role_size = n_roles
        self.vocab_size = vocab_size
        self.gpu_mode = gpu_mode
        self.mlp_hidden = mlp_hidden
        self.embd_hidden = embd_hidden
        self.role_handler = RoleQHandler()
        self.role_predictor = RolePredictor()
        self.noun_q_net = FCNet([mlp_hidden, mlp_hidden])
        self.noun_v_net = FCNet([mlp_hidden, mlp_hidden])
        self.noun_classifier = SimpleClassifier(
            mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)

        self.role_q_net = FCNet([mlp_hidden, mlp_hidden])
        self.role_v_net = FCNet([mlp_hidden, mlp_hidden])
        self.role_classifier = SimpleClassifier(
            mlp_hidden, 2 * mlp_hidden, self.role_size, 0.5)
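The SimpleClassifier head used throughout these examples is never defined on this page. As a reference point, here is a minimal sketch in the style of the widely copied bottom-up-attention VQA implementation (two weight-normalized linear layers with ReLU and dropout); note that some examples below pass extra norm/act/args parameters, so the exact signature varies by repository:

import torch.nn as nn
from torch.nn.utils import weight_norm

class SimpleClassifier(nn.Module):
    def __init__(self, in_dim, hid_dim, out_dim, dropout):
        super(SimpleClassifier, self).__init__()
        # in_dim -> hid_dim -> out_dim MLP; dim=None normalizes the whole
        # weight tensor rather than a single slice
        self.main = nn.Sequential(
            weight_norm(nn.Linear(in_dim, hid_dim), dim=None),
            nn.ReLU(),
            nn.Dropout(dropout, inplace=True),
            weight_norm(nn.Linear(hid_dim, out_dim), dim=None),
        )

    def forward(self, x):
        return self.main(x)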
def build_DD(dataset, num_hid):
    # 2048 is presumably the raw visual feature dimension for the vision-only head
    classifier_V = SimpleClassifier(
        2048, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_Q = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)

    return classifier_V, classifier_Q
Example #3
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False,
                              .0)
    v_att = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    q_att = weight_norm(nn.Linear(num_hid, 1), dim=None)

    w_emb2 = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb2 = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1,
                               False, .0)
    v_att2 = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    v_net2 = FCNet([dataset.v_dim, num_hid])
    q_att2 = weight_norm(nn.Linear(num_hid, 1), dim=None)

    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2, 3, .5)
        classifier2 = SimpleClassifier(num_hid, num_hid * 2,
                                       dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, w_emb, q_emb, v_att, q_att, b_net, q_prj,
                        c_prj, q_net, v_net, classifier, classifier2, counter,
                        op, gamma, w_emb2, q_emb2, v_att2, v_net2, q_att2)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
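FCNet is likewise repo-specific. A sketch of the BAN-style variant assumed by calls such as FCNet([num_hid, num_hid], '', .2) above: dims lists the layer widths, act names a torch.nn activation class ('' disables it), and dropout is applied before each linear layer:

import torch.nn as nn
from torch.nn.utils import weight_norm

class FCNet(nn.Module):
    # non-linear fully connected stack over dims = [in, ..., out]
    def __init__(self, dims, act='ReLU', dropout=0):
        super(FCNet, self).__init__()
        layers = []
        for i in range(len(dims) - 1):
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
            layers.append(weight_norm(nn.Linear(dims[i], dims[i + 1]), dim=None))
            if act != '':
                layers.append(getattr(nn, act)())  # e.g. nn.ReLU()
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        return self.main(x)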
Example #4
    def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
        super(BaseModel, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.n_roles = self.encoder.get_num_roles()
        self.n_verbs = self.encoder.get_num_verbs()
        self.vocab_size = self.encoder.get_num_labels()
        self.max_role_count = self.encoder.get_max_role_count()
        self.n_role_q_vocab = len(self.encoder.question_words)

        self.conv = vgg16_modified()
        self.verb_lookup = nn.Embedding(self.n_verbs, embed_hidden)

        self.role_gcn = GCN(nfeat=mlp_hidden, nhid=mlp_hidden, dropout=0.5)

        self.role_classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                                self.n_roles + 1, 0.5)

        self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                           self.vocab_size, 0.5)

        self.w_emb = nn.Embedding(self.n_role_q_vocab + 1,
                                  embed_hidden,
                                  padding_idx=self.n_role_q_vocab)
        self.q_emb = nn.LSTM(embed_hidden,
                             mlp_hidden,
                             batch_first=True,
                             bidirectional=True)
        self.q_prep = FCNet([mlp_hidden, mlp_hidden])
        self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
        self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
        self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])

        self.conv_hidden = self.conv.base_size()
        self.mlp_hidden = mlp_hidden
        self.embed_hidden = embed_hidden
Example #5
    def __init__(self, encoder,
                 gpu_mode,
                 embed_hidden=300,
                 mlp_hidden=512):
        super(BaseModel, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.n_roles = self.encoder.get_num_roles()
        self.n_verbs = self.encoder.get_num_verbs()
        self.vocab_size = self.encoder.get_num_labels()
        self.agent_vocab_size = len(self.encoder.agent_label_list)
        self.place_vocab_size = len(self.encoder.place_label_list)
        self.max_role_count = self.encoder.get_max_role_count()
        self.n_role_q_vocab = len(self.encoder.question_words)

        self.conv = vgg16_modified()
        self.verb_lookup = nn.Embedding(self.n_verbs, embed_hidden)
        self.w_emb = nn.Embedding(self.n_role_q_vocab + 1, embed_hidden, padding_idx=self.n_role_q_vocab)
        self.q_emb1 = nn.LSTM(embed_hidden, mlp_hidden,
                              batch_first=True, bidirectional=True)
        self.lstm_proj1 = nn.Linear(mlp_hidden * 2, mlp_hidden)
        self.q_emb2 = nn.LSTM(mlp_hidden, mlp_hidden,
                              batch_first=True, bidirectional=True)
        self.lstm_proj2 = nn.Linear(mlp_hidden * 2, mlp_hidden)
        self.roles = TopDown(self.vocab_size)
        self.other_classifier = SimpleClassifier(
            mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)
        self.agent_classifier = SimpleClassifier(
            mlp_hidden, 2 * mlp_hidden, self.agent_vocab_size, 0.5)
        self.place_classifier = SimpleClassifier(
            mlp_hidden, 2 * mlp_hidden, self.place_vocab_size, 0.5)

        self.conv_hidden = self.conv.base_size()
        self.mlp_hidden = mlp_hidden
        self.embed_hidden = embed_hidden
        self.dropout = nn.Dropout(0.3)
        self.num_steps = 3
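Example #6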
def build_LL_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier_LL = SimpleClassifier(
        num_hid * 4, num_hid // 8, 1, 0.5)  # integer division keeps the hidden dim an int
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return LL4ALModel(w_emb, q_emb, v_att, q_net, v_net, classifier_LL, classifier_All)
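Example #7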
def build_multimodal_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier_V = SimpleClassifier(
        2048, num_hid * 2, dataset.num_ans_candidates, 0.5)  # 2048: assumed raw visual feature dim
    classifier_Q = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return MultiModalModel(w_emb, q_emb, v_att, q_net, v_net, classifier_V, classifier_Q, classifier_All)
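Example #8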
    def __init__(self, dataset, args):
        super(BAN_Model, self).__init__()

        self.args = args
        # init word embedding module, question embedding module, biAttention network, bi_residual network, and classifier
        self.w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0,
                                   args.cat)
        self.q_emb = QuestionEmbedding(600 if args.cat else 300, args.hid_dim,
                                       1, False, .0, args.rnn)

        # for close att+ resnet + classify
        self.close_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim,
                                     args.glimpse)
        self.close_resnet = BiResNet(args, dataset)
        self.close_classifier = SimpleClassifier(args.hid_dim,
                                                 args.hid_dim * 2,
                                                 dataset.num_close_candidates,
                                                 args)

        # for open_att + resnet + classify
        self.open_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim,
                                    args.glimpse)
        self.open_resnet = BiResNet(args, dataset)
        self.open_classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2,
                                                dataset.num_open_candidates,
                                                args)

        # type attention: b * 1024
        self.typeatt = typeAttention(dataset.dictionary.ntoken,
                                     './data/glove6b_init_300d.npy')

        # build and load pre-trained MAML model
        if args.maml:
            weight_path = args.data_dir + '/' + args.maml_model_path
            print('load initial weights MAML from: %s' % (weight_path))
            self.maml = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
        # build and load pre-trained Auto-encoder model
        if args.autoencoder:
            self.ae = Auto_Encoder_Model()
            weight_path = args.data_dir + '/' + args.ae_model_path
            print('load initial weights DAE from: %s' % (weight_path))
            self.ae.load_state_dict(torch.load(weight_path))
            self.convert = nn.Linear(16384, 64)
        # Loading tfidf weighted embedding
        if hasattr(args, 'tfidf'):
            self.w_emb = tfidf_loading(args.tfidf, self.w_emb, args)

        # Loading the other net
        if args.other_model:
            pass
Example #9
def build_CCB_model(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_ct_net = FCNet([q_emb.num_hid, num_hid])
    q_cx_net = FCNet([q_emb.num_hid, num_hid])
    v_ct_net = FCNet([dataset.v_dim, num_hid])
    v_cx_net = FCNet([dataset.v_dim, num_hid])
    classifier_fq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    classifier_vq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    return CCB_Model(w_emb, q_emb, v_att, q_ct_net, q_cx_net, v_ct_net,
                     classifier_fq, classifier_vq, v_cx_net)
Example #10
def classify_species(s0, s1, ratio):
    sc = SimpleClassifier()
    iris, names = load_iris_data(s0, s1, ratio)
    sc.add_data(*iris.training)
    sc.train()
    valr = np.array([sc.classify(x) for x in iris.training[0]])
    val = np.array([sc.classify(x) for x in iris.validation[0]])
    return risk(valr, iris.training[1]), risk(val, iris.validation[1])
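Unlike the torch heads elsewhere in this listing, the SimpleClassifier in Example #10 is a classical classifier with an add_data/train/classify interface. A hedged usage sketch (the species identifiers and split ratio are illustrative, not from the source):

# returns (empirical risk on the training split, risk on the validation split)
train_risk, val_risk = classify_species(0, 1, ratio=0.7)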
Example #11
    def __init__(self, opt):
        super(Model, self).__init__()
        self.dictionary = Dictionary.load_from_file(opt.dataroot +
                                                    'dictionary.pkl')
        num_hid = 128
        activation = opt.activation
        dropG = opt.dropG
        dropW = opt.dropW
        dropout = opt.dropout
        dropL = opt.dropL
        norm = opt.norm
        dropC = opt.dropC
        self.opt = opt

        self.w_emb = WordEmbedding(opt.ntokens, emb_dim=300, dropout=dropW)
        self.w_emb.init_embedding(opt.dataroot + 'glove6b_init_300d.npy')
        self.q_emb = QuestionEmbedding(in_dim=300,
                                       num_hid=num_hid,
                                       nlayers=1,
                                       bidirect=False,
                                       dropout=dropG,
                                       rnn_type='GRU')
        self.q_net = FCNet([self.q_emb.num_hid, num_hid],
                           dropout=dropL,
                           norm=norm,
                           act=activation)
        self.classifier = SimpleClassifier(
            in_dim=num_hid,
            hid_dim=num_hid // 2,
            out_dim=2,  #opt.test_candi_ans_num,
            dropout=dropC,
            norm=norm,
            act=activation)
        self.normal = nn.BatchNorm1d(num_hid, affine=False)
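Example #12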
    def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
        super(role_module, self).__init__()

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.n_roles = self.encoder.get_num_roles()
        self.n_verbs = self.encoder.get_num_verbs()
        self.vocab_size = self.encoder.get_num_labels()
        self.max_role_count = self.encoder.get_max_role_count()
        self.n_role_q_vocab = len(self.encoder.question_words)

        self.conv = vgg16_modified()
        self.verb_lookup = nn.Embedding(self.n_verbs, embed_hidden)
        self.w_emb = nn.Embedding(self.n_role_q_vocab + 1,
                                  embed_hidden,
                                  padding_idx=self.n_role_q_vocab)
        self.q_emb = nn.LSTM(embed_hidden,
                             mlp_hidden,
                             batch_first=True,
                             bidirectional=True)
        self.q_prep = FCNet([mlp_hidden, mlp_hidden])
        self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
        self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
        self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])
        self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                           self.vocab_size, 0.5)

        self.conv_hidden = self.conv.base_size()
        self.mlp_hidden = mlp_hidden
        self.embed_hidden = embed_hidden
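Example #13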
    def __init__(self,
                 n_vocab,
                 dim,
                 embed_hidden=300,
                 max_step=12,
                 self_attention=False,
                 memory_gate=False,
                 classes=28,
                 dropout=0.15):
        super().__init__()

        #print('embed vocab size :', n_vocab)
        self.embed = nn.Embedding(n_vocab + 1, embed_hidden)
        self.lstm = nn.LSTM(embed_hidden,
                            dim,
                            batch_first=True,
                            bidirectional=True)
        self.lstm_proj = linear(dim * 2, dim)
        self.full_q_lstm_proj = linear(dim * 2, dim)
        self.mac = MACUnit(dim, max_step, self_attention, memory_gate, dropout)

        self.q_net = FCNet([dim, dim])
        self.v_net = FCNet([dim, dim])
        self.classifier = SimpleClassifier(dim, 2 * dim, classes, 0.5)

        self.max_step = max_step
        self.dim = dim

        self.reset()
Example #14
    def __init__(self, opt):
        super(Model, self).__init__()
        num_hid = opt.num_hid
        activation = opt.activation
        dropG = opt.dropG
        dropW = opt.dropW
        dropout = opt.dropout
        dropL = opt.dropL
        norm = opt.norm
        dropC = opt.dropC
        self.opt = opt

        self.w_emb = WordEmbedding(opt.ntokens, emb_dim=300, dropout=dropW)
        self.w_emb.init_embedding(opt.dataroot + 'glove6b_init_300d.npy')
        self.q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                                       bidirect=False, dropout=dropG, rnn_type='GRU')

        self.q_net = FCNet([self.q_emb.num_hid, num_hid], dropout=dropL, norm=norm, act=activation)
        self.gv_net = FCNet([opt.v_dim, num_hid], dropout=dropL, norm=norm, act=activation)

        self.gv_att_1 = Att_3(v_dim=opt.v_dim, q_dim=self.q_emb.num_hid, num_hid=num_hid, dropout=dropout, norm=norm,
                              act=activation)
        self.gv_att_2 = Att_3(v_dim=opt.v_dim, q_dim=self.q_emb.num_hid, num_hid=num_hid, dropout=dropout, norm=norm,
                              act=activation)
        self.classifier = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid, out_dim=opt.ans_dim,
                                           dropout=dropC, norm=norm, act=activation)

        self.normal = nn.BatchNorm1d(num_hid, affine=False)
Example #15
    def __init__(self, args):
        super(_netG, self).__init__()

        self.ninp = args.ninp
        self.nhid = args.nhid
        self.nlayers = args.nlayers
        self.dropout = args.dropout
        self.rnn = getattr(nn, 'LSTM')(self.ninp,
                                       self.nhid,
                                       self.nlayers,
                                       bidirectional=False,
                                       dropout=self.dropout,
                                       batch_first=True)
        self.rnn_type = 'LSTM'

        self.decoder = SimpleClassifier(self.nhid * 2, self.nhid * 4,
                                        args.vocab_size, self.dropout)
        self.d = args.dropout
        self.beta = 3
        self.vocab_size = args.vocab_size
        # self.init_weights()
        self.w_q = nn.Linear(self.nhid * 2, self.nhid)
        self.ans_q = nn.Linear(self.nhid, self.nhid)
        self.Wa_q = nn.Linear(self.nhid, 1)

        self.w_h = nn.Linear(self.nhid * 2, self.nhid)
        self.ans_h = nn.Linear(self.nhid, self.nhid)
        self.Wa_h = nn.Linear(self.nhid, 1)

        self.w_i = nn.Linear(self.nhid * 2, self.nhid)
        self.ans_i = nn.Linear(self.nhid, self.nhid)
        self.Wa_i = nn.Linear(self.nhid, 1)

        self.concat = nn.Linear(self.nhid * 3, self.nhid)
Example #16
def build_SAN(dataset, args):
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid,
                              1, False, 0.0, args.rnn)
    v_att = StackedAttention(args.num_stacks, dataset.v_dim, args.num_hid,
                             args.num_hid, dataset.num_ans_candidates,
                             args.dropout)
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % (weight_path))
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s' % (weight_path))
        ae_v_emb.load_state_dict(torch.load(weight_path))
    # Loading tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # init classifier
    classifier = SimpleClassifier(args.num_hid, 2 * args.num_hid,
                                  dataset.num_ans_candidates, args)
    # construct VQA model and return
    if args.maml and args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb,
                         ae_v_emb)
    elif args.maml:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb,
                         None)
    elif args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, ae_v_emb)
    return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, None)
Example #17
    def __init__(self,
                 encoder,
                 gpu_mode,
                 role_lookup,
                 ans_lookup,
                 q_word_embeddings,
                 num_roles,
                 num_labels,
                 mlp_hidden,
                 embd_hidden,
                 vqa_model):
        super(RoleNode, self).__init__()

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.role_lookup = role_lookup
        self.ans_lookup = ans_lookup
        self.q_word_embeddings = q_word_embeddings
        self.num_roles = num_roles
        self.num_labels = num_labels
        self.mlp_hidden = mlp_hidden
        self.embd_hidden = embd_hidden
        self.vqa_model = vqa_model
        self.verb_project = nn.Linear(embd_hidden, mlp_hidden)

        self.label_classifier = SimpleClassifier(
            mlp_hidden, 2 * mlp_hidden, self.num_labels + 1, 0.5)
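Example #18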
    def __init__(self, encoder,
                 embed_hidden=300,
                 mlp_hidden=512,
                 time_steps=3):
        super(TopDownWithContext, self).__init__()

        self.encoder = encoder
        self.n_verbs = self.encoder.get_num_verbs()
        self.vocab_size = self.encoder.get_num_labels()
        self.max_role_count = self.encoder.get_max_role_count()
        self.n_role_q_vocab = len(self.encoder.question_words)

        self.w_emb = nn.Embedding(self.n_role_q_vocab + 1, embed_hidden, padding_idx=self.n_role_q_vocab)
        self.q_emb = nn.LSTM(embed_hidden, mlp_hidden,
                             batch_first=True, bidirectional=True)
        self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
        self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
        self.ctx_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
        self.context = FCNet([mlp_hidden*2, mlp_hidden])
        #self.role_weight = RoleWeightAttention(mlp_hidden, mlp_hidden, mlp_hidden)
        self.detailedq = FCNet([mlp_hidden*2, mlp_hidden])
        self.concat = FCNet([mlp_hidden*2, mlp_hidden])
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])
        self.classifier = SimpleClassifier(
            mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)
        self.time_steps = time_steps
Example #19
    def __init__(self,
                 encoder,
                 role_lookup,
                 label_lookup,
                 qword_embeddings,
                 vocab_size,
                 gpu_mode,
                 mlp_hidden=512,
                 emd_hidden=300):
        super(ImSituationHandler, self).__init__()

        self.encoder = encoder
        self.role_lookup = role_lookup
        self.label_lookup = label_lookup
        self.qword_embeddings = qword_embeddings
        self.vocab_size = vocab_size
        self.gpu_mode = gpu_mode
        self.mlp_hidden = mlp_hidden
        self.emd_hidden = emd_hidden
        self.q_emb = nn.LSTM(self.emd_hidden,
                             self.mlp_hidden,
                             batch_first=True,
                             bidirectional=True)
        self.lstm_word_proj = nn.Linear(self.mlp_hidden * 2, self.mlp_hidden)
        self.role_handler = RoleQHandler()
        self.q_net = FCNet([self.mlp_hidden * 2, self.mlp_hidden])
        self.v_net = FCNet([self.mlp_hidden, self.mlp_hidden])
        self.classifier = SimpleClassifier(self.mlp_hidden,
                                           2 * self.mlp_hidden,
                                           self.vocab_size, 0.5)
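Example #20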
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa', use_counter=True):
    #w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    #q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    w_emb = AlbertTokenizer.from_pretrained('albert-large-v2')
    q_emb = AlbertModel.from_pretrained('albert-large-v2')
    params_set = set()
    for param in q_emb.parameters():
        params_set.add(param)
        param.requires_grad = False
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2,
                                      dataset.num_ans_candidates, .5)
        counter = Counter(objects) if use_counter else None
        return BanModel(dataset, params_set, w_emb, q_emb, v_att, b_net, q_prj,
                        c_prj, classifier, counter, op, gamma)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
Example #21
def build_ban(num_token,
              v_dim,
              num_hid,
              num_ans,
              op='',
              gamma=4,
              reasoning=False):
    w_emb = WordEmbedding(num_token, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False,
                              .0)
    if not reasoning:
        v_att = BiAttention(v_dim, num_hid, num_hid, gamma)
    else:
        v_att = BiAttention(v_dim, num_hid, num_hid, 1)

    # constructing the model
    b_net = []
    q_prj = []
    c_prj = []
    objects = 36  # minimum number of boxes, originally 10
    for i in range(gamma):
        b_net.append(BCNet(v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    classifier = SimpleClassifier(num_hid, num_hid * 2, num_ans, .5)
    counter = Counter(objects)
    if not reasoning:
        return BanModel(w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier,
                        counter, op, gamma, num_hid)
    else:
        return BanModel_Reasoning(w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                                  classifier, counter, op, gamma, num_hid)
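Example #22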
    def __init__(self,
                 max_role_count,
                 vocab_size,
                 gpu_mode,
                 embed_hidden=300,
                 mlp_hidden=512):
        super(TopDown, self).__init__()

        self.vocab_size = vocab_size
        self.max_role_count = max_role_count
        self.gpu_mode = gpu_mode
        '''self.q_emb = nn.LSTM(embed_hidden, mlp_hidden,
                             batch_first=True, bidirectional=True)
        self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)'''
        self.q_proj = nn.Sequential(
            nn.Linear(embed_hidden * 2, mlp_hidden),
            nn.ReLU(),
        )
        self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])
        self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                           self.vocab_size, 0.5)

        self.mlp_hidden = mlp_hidden
        self.dropout = nn.Dropout(0.2)
Example #23
def build_model_APD(dataset, num_hid, dropout, norm, activation, dropL, dropG,
                    dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken,
                          emb_dim=300,
                          dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300,
                              num_hid=num_hid,
                              nlayers=1,
                              bidirect=False,
                              dropout=dropG,
                              rnn_type='GRU')

    v_att = Att_PD(v_dim=dataset.v_dim,
                   q_dim=q_emb.num_hid,
                   num_hid=num_hid,
                   dropout=dropout,
                   norm=norm,
                   act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)
    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=dropL,
                  norm=norm,
                  act=activation)

    classifier = SimpleClassifier(in_dim=num_hid,
                                  hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=dropC,
                                  norm=norm,
                                  act=activation)
    return Model(w_emb, q_emb, v_att, q_net, v_net, classifier)
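Example #24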
    def __init__(self,
                 encoder,
                 qword_embeddings,
                 vocab_size,
                 gpu_mode,
                 mlp_hidden=512):
        super(ImSituationHandler, self).__init__()

        self.encoder = encoder
        self.qword_embeddings = qword_embeddings
        self.vocab_size = vocab_size
        self.gpu_mode = gpu_mode
        self.role_handler = RoleQHandler()
        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])
        self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                           self.vocab_size, 0.5)
        self.mlp_hidden = mlp_hidden

        self.g = nn.Sequential(
            nn.Linear(mlp_hidden * 2, mlp_hidden),
            nn.ReLU(),
            nn.Linear(mlp_hidden, mlp_hidden),
            nn.ReLU(),
            nn.Linear(mlp_hidden, mlp_hidden),
            nn.ReLU(),
            nn.Linear(mlp_hidden, mlp_hidden),
            nn.ReLU(),
        )

        self.f = nn.Sequential(nn.Linear(mlp_hidden, mlp_hidden), nn.ReLU(),
                               nn.Linear(mlp_hidden, mlp_hidden))
Example #25
    def __init__(self,
                 encoder,
                 gpu_mode,
                 conv_hidden=24,
                 embed_hidden=300,
                 lstm_hidden=300,
                 mlp_hidden=512):
        super(BaseModel, self).__init__()

        self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])

        self.train_transform = tv.transforms.Compose([
            tv.transforms.RandomRotation(10),
            tv.transforms.RandomResizedCrop(224),
            tv.transforms.RandomHorizontalFlip(),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.dev_transform = tv.transforms.Compose([
            tv.transforms.Resize(224),
            tv.transforms.CenterCrop(224),
            tv.transforms.ToTensor(),
            self.normalize,
        ])

        self.encoder = encoder
        self.gpu_mode = gpu_mode
        self.mlp_hidden = mlp_hidden
        #self.vocab_size = self.encoder.get_num_labels()
        self.n_verbs = self.encoder.get_num_verbs()
        self.vocab_size = self.encoder.get_num_labels()

        self.q_word_count = len(self.encoder.question_words)

        self.conv_agent = vgg16_modified()
        self.conv_verb = vgg16_modified_feat()

        self.agent = nn.Sequential(
            nn.Linear(mlp_hidden * 8, mlp_hidden * 2),
            nn.BatchNorm1d(mlp_hidden * 2),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(mlp_hidden * 2, self.vocab_size),
        )

        self.w_emb = nn.Embedding(self.q_word_count + 1,
                                  embed_hidden,
                                  padding_idx=self.q_word_count)
        self.verb = TopDown()

        self.q_net = FCNet([mlp_hidden, mlp_hidden])
        self.v_net = FCNet([mlp_hidden, mlp_hidden])

        self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                           self.n_verbs, 0.5)

        self.logits = {}
Example #26
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid], 0.0, "weight", "ReLU")
    v_net = FCNet([dataset.v_dim, num_hid], 0.0, "weight", "ReLU")
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel0(w_emb, q_emb, v_att, q_net, v_net, classifier)
Example #27
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
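A hedged construction sketch for the builder above; train_dset stands in for any dataset object exposing question_dictionary.ntoken, v_dim, and num_ans_candidates, as the builder requires:

model = build_baseline0_newatt(train_dset, num_hid=1024)  # 1024 is a typical hidden size
model = model.cuda()  # optional; mirrors the gpu_mode flags used elsewhere in this listing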
Example #28
def build_caq(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates + 1, 0.5)
    return CAQModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
Example #29
def build_baseline0_newatt_lstm_bidirection(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding2(300, num_hid, 1, True, 0.0, rnn_type='LSTM')
    v_att = NewAttention2(dataset.v_dim, q_emb.out_hid, num_hid)
    q_net = FCNet([q_emb.out_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
Example #30
    def __init__(self, image_emb_size, qst_emb_size, no_ans):
        super(VQA_Model, self).__init__()
        num_hid = 1024
        #emb_size = image_emb_size + qst_emb_size
        self.img_att = NewAttention(image_emb_size, qst_emb_size, qst_emb_size)
        # self.linear = nn.Linear(emb_size, no_ans)
        self.q_net = FCNet([qst_emb_size, num_hid])    # projects the question embedding
        self.v_net = FCNet([image_emb_size, num_hid])  # projects the attended image features
        self.classifier = SimpleClassifier(num_hid, num_hid * 2, no_ans, 0.5)