def __init__(self, encoder, qword_embeddings, verb_lookup, n_roles, vocab_size,
             gpu_mode, mlp_hidden=512, embd_hidden=300):
    super(ImSituationHandler, self).__init__()
    self.encoder = encoder
    self.qword_embeddings = qword_embeddings
    self.verb_lookup = verb_lookup
    self.role_size = n_roles
    self.vocab_size = vocab_size
    self.gpu_mode = gpu_mode
    self.mlp_hidden = mlp_hidden
    self.embd_hidden = embd_hidden
    self.role_handler = RoleQHandler()
    self.role_predictor = RolePredictor()
    # noun branch: project question/visual features, classify over the label vocabulary
    self.noun_q_net = FCNet([mlp_hidden, mlp_hidden])
    self.noun_v_net = FCNet([mlp_hidden, mlp_hidden])
    self.noun_classifier = SimpleClassifier(
        mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)
    # role branch: same structure, but classifies over the role vocabulary
    self.role_q_net = FCNet([mlp_hidden, mlp_hidden])
    self.role_v_net = FCNet([mlp_hidden, mlp_hidden])
    self.role_classifier = SimpleClassifier(
        mlp_hidden, 2 * mlp_hidden, self.role_size, 0.5)
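# The constructors in this file assume the SimpleClassifier module from the
# bottom-up-attention VQA codebase: a two-layer MLP with weight normalization,
# ReLU, and dropout. A minimal sketch of that interface is given below for
# reference; note that signatures vary across forks (some pass an `args` object
# or extra norm/act keywords instead of a plain dropout float).
import torch.nn as nn
from torch.nn.utils import weight_norm


class SimpleClassifier(nn.Module):
    def __init__(self, in_dim, hid_dim, out_dim, dropout):
        super(SimpleClassifier, self).__init__()
        self.main = nn.Sequential(
            weight_norm(nn.Linear(in_dim, hid_dim), dim=None),
            nn.ReLU(),
            nn.Dropout(dropout, inplace=True),
            weight_norm(nn.Linear(hid_dim, out_dim), dim=None),
        )

    def forward(self, x):
        # x: (batch, in_dim) fused feature -> (batch, out_dim) answer logits
        return self.main(x)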
def build_DD(dataset, num_hid):
    # visual-only classifier: 2048 is the raw image feature dimension
    classifier_V = SimpleClassifier(
        2048, num_hid * 2, dataset.num_ans_candidates, 0.5)
    # question-only classifier
    classifier_Q = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return classifier_V, classifier_Q
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    v_att = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    q_att = weight_norm(nn.Linear(num_hid, 1), dim=None)
    # second stream with its own word embedding, attention, and projection
    w_emb2 = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb2 = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    v_att2 = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    v_net2 = FCNet([dataset.v_dim, num_hid])
    q_att2 = weight_norm(nn.Linear(num_hid, 1), dim=None)
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2, 3, .5)
        classifier2 = SimpleClassifier(num_hid, num_hid * 2,
                                       dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, w_emb, q_emb, v_att, q_att, b_net, q_prj, c_prj,
                        q_net, v_net, classifier, classifier2, counter, op, gamma,
                        w_emb2, q_emb2, v_att2, v_net2, q_att2)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
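# Hedged usage sketch for the two-stream build_ban above: the stand-in dataset
# exposes only the attributes the builder reads, and every value here is an
# illustrative assumption rather than part of the original code.
class _Dictionary:
    ntoken = 20000              # question vocabulary size (assumed)

class _Dataset:
    dictionary = _Dictionary()
    v_dim = 2048                # region feature size (assumed)
    num_ans_candidates = 3129

ban = build_ban(_Dataset(), num_hid=1280, op='', gamma=4, task='vqa')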
def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
    super(BaseModel, self).__init__()
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.n_roles = self.encoder.get_num_roles()
    self.n_verbs = self.encoder.get_num_verbs()
    self.vocab_size = self.encoder.get_num_labels()
    self.max_role_count = self.encoder.get_max_role_count()
    self.n_role_q_vocab = len(self.encoder.question_words)
    self.conv = vgg16_modified()
    self.verb_lookup = nn.Embedding(self.n_verbs, embed_hidden)
    self.role_gcn = GCN(nfeat=mlp_hidden, nhid=mlp_hidden, dropout=0.5)
    self.role_classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                            self.n_roles + 1, 0.5)
    self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                       self.vocab_size, 0.5)
    self.w_emb = nn.Embedding(self.n_role_q_vocab + 1, embed_hidden,
                              padding_idx=self.n_role_q_vocab)
    self.q_emb = nn.LSTM(embed_hidden, mlp_hidden, batch_first=True,
                         bidirectional=True)
    self.q_prep = FCNet([mlp_hidden, mlp_hidden])
    self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
    self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.conv_hidden = self.conv.base_size()
    self.mlp_hidden = mlp_hidden
    self.embed_hidden = embed_hidden
def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
    super(BaseModel, self).__init__()
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.n_roles = self.encoder.get_num_roles()
    self.n_verbs = self.encoder.get_num_verbs()
    self.vocab_size = self.encoder.get_num_labels()
    self.agent_vocab_size = len(self.encoder.agent_label_list)
    self.place_vocab_size = len(self.encoder.place_label_list)
    self.max_role_count = self.encoder.get_max_role_count()
    self.n_role_q_vocab = len(self.encoder.question_words)
    self.conv = vgg16_modified()
    self.verb_lookup = nn.Embedding(self.n_verbs, embed_hidden)
    self.w_emb = nn.Embedding(self.n_role_q_vocab + 1, embed_hidden,
                              padding_idx=self.n_role_q_vocab)
    self.q_emb1 = nn.LSTM(embed_hidden, mlp_hidden, batch_first=True,
                          bidirectional=True)
    self.lstm_proj1 = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.q_emb2 = nn.LSTM(mlp_hidden, mlp_hidden, batch_first=True,
                          bidirectional=True)
    self.lstm_proj2 = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.roles = TopDown(self.vocab_size)
    # separate classifiers for the agent role, the place role, and all other roles
    self.other_classifier = SimpleClassifier(
        mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)
    self.agent_classifier = SimpleClassifier(
        mlp_hidden, 2 * mlp_hidden, self.agent_vocab_size, 0.5)
    self.place_classifier = SimpleClassifier(
        mlp_hidden, 2 * mlp_hidden, self.place_vocab_size, 0.5)
    self.conv_hidden = self.conv.base_size()
    self.mlp_hidden = mlp_hidden
    self.embed_hidden = embed_hidden
    self.dropout = nn.Dropout(0.3)
    self.num_steps = 3
def build_LL_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    # loss-prediction head; integer division so the hidden size is a valid
    # layer dimension (the original used float division, num_hid / 8)
    classifier_LL = SimpleClassifier(
        num_hid * 4, num_hid // 8, 1, 0.5)
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return LL4ALModel(w_emb, q_emb, v_att, q_net, v_net,
                      classifier_LL, classifier_All)
def build_multimodal_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    # visual-only classifier: 2048 is the raw image feature dimension
    classifier_V = SimpleClassifier(
        2048, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_Q = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_All = SimpleClassifier(
        num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return MultiModalModel(w_emb, q_emb, v_att, q_net, v_net,
                           classifier_V, classifier_Q, classifier_All)
def __init__(self, dataset, args):
    super(BAN_Model, self).__init__()
    self.args = args
    # init word embedding module, question embedding module, biAttention network,
    # bi_residual network, and classifier
    self.w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.cat)
    self.q_emb = QuestionEmbedding(600 if args.cat else 300, args.hid_dim, 1,
                                   False, .0, args.rnn)
    # closed-ended branch: attention + residual network + classifier
    # (this SimpleClassifier variant takes `args` in place of a dropout float)
    self.close_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim,
                                 args.glimpse)
    self.close_resnet = BiResNet(args, dataset)
    self.close_classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2,
                                             dataset.num_close_candidates, args)
    # open-ended branch: attention + residual network + classifier
    self.open_att = BiAttention(dataset.v_dim, args.hid_dim, args.hid_dim,
                                args.glimpse)
    self.open_resnet = BiResNet(args, dataset)
    self.open_classifier = SimpleClassifier(args.hid_dim, args.hid_dim * 2,
                                            dataset.num_open_candidates, args)
    # type attention: b * 1024
    self.typeatt = typeAttention(dataset.dictionary.ntoken,
                                 './data/glove6b_init_300d.npy')
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.data_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % weight_path)
        self.maml = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        self.ae = Auto_Encoder_Model()
        weight_path = args.data_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s' % weight_path)
        self.ae.load_state_dict(torch.load(weight_path))
        self.convert = nn.Linear(16384, 64)
    # load tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        self.w_emb = tfidf_loading(args.tfidf, self.w_emb, args)
    # load the other net
    if args.other_model:
        pass
def build_CCB_model(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    # ct/cx projections (presumably content and context) for question and
    # visual features
    q_ct_net = FCNet([q_emb.num_hid, num_hid])
    q_cx_net = FCNet([q_emb.num_hid, num_hid])
    v_ct_net = FCNet([dataset.v_dim, num_hid])
    v_cx_net = FCNet([dataset.v_dim, num_hid])
    classifier_fq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    classifier_vq = SimpleClassifier(num_hid, num_hid * 2,
                                     dataset.num_ans_candidates, 0.5)
    return CCB_Model(w_emb, q_emb, v_att, q_ct_net, q_cx_net, v_ct_net,
                     classifier_fq, classifier_vq, v_cx_net)
def classify_species(s0, s1, ratio):
    # note: this SimpleClassifier is a plain, no-argument iris classifier,
    # unrelated to the VQA module used elsewhere in this file
    sc = SimpleClassifier()
    iris, names = load_iris_data(s0, s1, ratio)
    sc.add_data(*iris.training)
    sc.train()
    # empirical risk on the training split and on the held-out validation split
    valr = np.array([sc.classify(x) for x in iris.training[0]])
    val = np.array([sc.classify(x) for x in iris.validation[0]])
    return risk(valr, iris.training[1]), risk(val, iris.validation[1])
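# Hedged usage sketch for classify_species above: the species names and the
# reading of `ratio` as a train/validation split fraction are assumptions about
# load_iris_data's contract, not taken from the original code.
train_risk, val_risk = classify_species('setosa', 'versicolor', ratio=0.7)
print('training risk: %.3f  validation risk: %.3f' % (train_risk, val_risk))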
def __init__(self, opt):
    super(Model, self).__init__()
    self.dictionary = Dictionary.load_from_file(opt.dataroot + 'dictionary.pkl')
    num_hid = 128
    activation = opt.activation
    dropG = opt.dropG
    dropW = opt.dropW
    dropout = opt.dropout
    dropL = opt.dropL
    norm = opt.norm
    dropC = opt.dropC
    self.opt = opt
    self.w_emb = WordEmbedding(opt.ntokens, emb_dim=300, dropout=dropW)
    self.w_emb.init_embedding(opt.dataroot + 'glove6b_init_300d.npy')
    self.q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                                   bidirect=False, dropout=dropG, rnn_type='GRU')
    self.q_net = FCNet([self.q_emb.num_hid, num_hid], dropout=dropL, norm=norm,
                       act=activation)
    self.classifier = SimpleClassifier(in_dim=num_hid, hid_dim=num_hid // 2,
                                       out_dim=2,  # opt.test_candi_ans_num
                                       dropout=dropC, norm=norm, act=activation)
    self.normal = nn.BatchNorm1d(num_hid, affine=False)
def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
    super(role_module, self).__init__()
    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.n_roles = self.encoder.get_num_roles()
    self.n_verbs = self.encoder.get_num_verbs()
    self.vocab_size = self.encoder.get_num_labels()
    self.max_role_count = self.encoder.get_max_role_count()
    self.n_role_q_vocab = len(self.encoder.question_words)
    self.conv = vgg16_modified()
    self.verb_lookup = nn.Embedding(self.n_verbs, embed_hidden)
    self.w_emb = nn.Embedding(self.n_role_q_vocab + 1, embed_hidden,
                              padding_idx=self.n_role_q_vocab)
    self.q_emb = nn.LSTM(embed_hidden, mlp_hidden, batch_first=True,
                         bidirectional=True)
    self.q_prep = FCNet([mlp_hidden, mlp_hidden])
    self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
    self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                       self.vocab_size, 0.5)
    self.conv_hidden = self.conv.base_size()
    self.mlp_hidden = mlp_hidden
    self.embed_hidden = embed_hidden
def __init__(self, n_vocab, dim, embed_hidden=300, max_step=12,
             self_attention=False, memory_gate=False, classes=28, dropout=0.15):
    super().__init__()
    # print('embed vocab size :', n_vocab)
    self.embed = nn.Embedding(n_vocab + 1, embed_hidden)
    self.lstm = nn.LSTM(embed_hidden, dim, batch_first=True, bidirectional=True)
    self.lstm_proj = linear(dim * 2, dim)
    self.full_q_lstm_proj = linear(dim * 2, dim)
    self.mac = MACUnit(dim, max_step, self_attention, memory_gate, dropout)
    self.q_net = FCNet([dim, dim])
    self.v_net = FCNet([dim, dim])
    self.classifier = SimpleClassifier(dim, 2 * dim, classes, 0.5)
    self.max_step = max_step
    self.dim = dim
    self.reset()
def __init__(self, opt):
    super(Model, self).__init__()
    num_hid = opt.num_hid
    activation = opt.activation
    dropG = opt.dropG
    dropW = opt.dropW
    dropout = opt.dropout
    dropL = opt.dropL
    norm = opt.norm
    dropC = opt.dropC
    self.opt = opt
    self.w_emb = WordEmbedding(opt.ntokens, emb_dim=300, dropout=dropW)
    self.w_emb.init_embedding(opt.dataroot + 'glove6b_init_300d.npy')
    self.q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                                   bidirect=False, dropout=dropG, rnn_type='GRU')
    self.q_net = FCNet([self.q_emb.num_hid, num_hid], dropout=dropL, norm=norm,
                       act=activation)
    self.gv_net = FCNet([opt.v_dim, num_hid], dropout=dropL, norm=norm,
                        act=activation)
    self.gv_att_1 = Att_3(v_dim=opt.v_dim, q_dim=self.q_emb.num_hid,
                          num_hid=num_hid, dropout=dropout, norm=norm,
                          act=activation)
    self.gv_att_2 = Att_3(v_dim=opt.v_dim, q_dim=self.q_emb.num_hid,
                          num_hid=num_hid, dropout=dropout, norm=norm,
                          act=activation)
    self.classifier = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid,
                                       out_dim=opt.ans_dim, dropout=dropC,
                                       norm=norm, act=activation)
    self.normal = nn.BatchNorm1d(num_hid, affine=False)
def __init__(self, args):
    super(_netG, self).__init__()
    self.ninp = args.ninp
    self.nhid = args.nhid
    self.nlayers = args.nlayers
    self.dropout = args.dropout
    self.rnn = getattr(nn, 'LSTM')(self.ninp, self.nhid, self.nlayers,
                                   bidirectional=False, dropout=self.dropout,
                                   batch_first=True)
    self.rnn_type = 'LSTM'
    self.decoder = SimpleClassifier(self.nhid * 2, self.nhid * 4,
                                    args.vocab_size, self.dropout)
    self.d = args.dropout
    self.beta = 3
    self.vocab_size = args.vocab_size
    # self.init_weights()
    # three attention scorers over nhid*2 inputs (presumably question, history,
    # and image), each compared against the answer state
    self.w_q = nn.Linear(self.nhid * 2, self.nhid)
    self.ans_q = nn.Linear(self.nhid, self.nhid)
    self.Wa_q = nn.Linear(self.nhid, 1)
    self.w_h = nn.Linear(self.nhid * 2, self.nhid)
    self.ans_h = nn.Linear(self.nhid, self.nhid)
    self.Wa_h = nn.Linear(self.nhid, 1)
    self.w_i = nn.Linear(self.nhid * 2, self.nhid)
    self.ans_i = nn.Linear(self.nhid, self.nhid)
    self.Wa_i = nn.Linear(self.nhid, 1)
    self.concat = nn.Linear(self.nhid * 3, self.nhid)
def build_SAN(dataset, args):
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid, 1,
                              False, 0.0, args.rnn)
    v_att = StackedAttention(args.num_stacks, dataset.v_dim, args.num_hid,
                             args.num_hid, dataset.num_ans_candidates,
                             args.dropout)
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % weight_path)
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s' % weight_path)
        ae_v_emb.load_state_dict(torch.load(weight_path))
    # load tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # init classifier
    classifier = SimpleClassifier(args.num_hid, 2 * args.num_hid,
                                  dataset.num_ans_candidates, args)
    # construct the VQA model and return it
    if args.maml and args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb, ae_v_emb)
    elif args.maml:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, maml_v_emb, None)
    elif args.autoencoder:
        return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, ae_v_emb)
    return SAN_Model(w_emb, q_emb, v_att, classifier, args, None, None)
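# Hedged usage sketch for build_SAN above. The Namespace fields mirror the
# attributes the builder (and, presumably, the args-taking SimpleClassifier
# variant) reads; the stand-in dataset and every value here are illustrative
# assumptions, not taken from the original code.
from argparse import Namespace

class _RadDictionary:
    ntoken = 1200               # assumed question vocabulary size

class _RadDataset:
    dictionary = _RadDictionary()
    v_dim = 64
    num_ans_candidates = 458

args = Namespace(op='', rnn='LSTM', num_hid=1024, num_stacks=2, dropout=0.2,
                 activation='relu', maml=False, autoencoder=False,
                 RAD_dir='./data_RAD')
san = build_SAN(_RadDataset(), args)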
def __init__(self, encoder, gpu_mode, role_lookup, ans_lookup, q_word_embeddings,
             num_roles, num_labels, mlp_hidden, embd_hidden, vqa_model):
    super(RoleNode, self).__init__()
    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.role_lookup = role_lookup
    self.ans_lookup = ans_lookup
    self.q_word_embeddings = q_word_embeddings
    self.num_roles = num_roles
    self.num_labels = num_labels
    self.mlp_hidden = mlp_hidden
    self.embd_hidden = embd_hidden
    self.vqa_model = vqa_model
    self.verb_project = nn.Linear(embd_hidden, mlp_hidden)
    # num_labels + 1 output classes, presumably to reserve one for "no label"
    self.label_classifier = SimpleClassifier(
        mlp_hidden, 2 * mlp_hidden, self.num_labels + 1, 0.5)
def __init__(self, encoder, embed_hidden=300, mlp_hidden=512, time_steps=3):
    super(TopDownWithContext, self).__init__()
    self.encoder = encoder
    self.n_verbs = self.encoder.get_num_verbs()
    self.vocab_size = self.encoder.get_num_labels()
    self.max_role_count = self.encoder.get_max_role_count()
    self.n_role_q_vocab = len(self.encoder.question_words)
    self.w_emb = nn.Embedding(self.n_role_q_vocab + 1, embed_hidden,
                              padding_idx=self.n_role_q_vocab)
    self.q_emb = nn.LSTM(embed_hidden, mlp_hidden, batch_first=True,
                         bidirectional=True)
    self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
    self.ctx_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
    self.context = FCNet([mlp_hidden * 2, mlp_hidden])
    # self.role_weight = RoleWeightAttention(mlp_hidden, mlp_hidden, mlp_hidden)
    self.detailedq = FCNet([mlp_hidden * 2, mlp_hidden])
    self.concat = FCNet([mlp_hidden * 2, mlp_hidden])
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(
        mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)
    self.time_steps = time_steps
def __init__(self, encoder, role_lookup, label_lookup, qword_embeddings,
             vocab_size, gpu_mode, mlp_hidden=512, emd_hidden=300):
    super(ImSituationHandler, self).__init__()
    self.encoder = encoder
    self.role_lookup = role_lookup
    self.label_lookup = label_lookup
    self.qword_embeddings = qword_embeddings
    self.vocab_size = vocab_size
    self.gpu_mode = gpu_mode
    self.mlp_hidden = mlp_hidden
    self.emd_hidden = emd_hidden
    self.q_emb = nn.LSTM(self.emd_hidden, self.mlp_hidden, batch_first=True,
                         bidirectional=True)
    self.lstm_word_proj = nn.Linear(self.mlp_hidden * 2, self.mlp_hidden)
    self.role_handler = RoleQHandler()
    self.q_net = FCNet([self.mlp_hidden * 2, self.mlp_hidden])
    self.v_net = FCNet([self.mlp_hidden, self.mlp_hidden])
    self.classifier = SimpleClassifier(self.mlp_hidden, 2 * self.mlp_hidden,
                                       self.vocab_size, 0.5)
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa', use_counter=True):
    # w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    # q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    # replace the GloVe/GRU question pipeline with a frozen pre-trained ALBERT encoder
    w_emb = AlbertTokenizer.from_pretrained('albert-large-v2')
    q_emb = AlbertModel.from_pretrained('albert-large-v2')
    params_set = set()
    for param in q_emb.parameters():
        params_set.add(param)
        param.requires_grad = False  # keep ALBERT frozen
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2,
                                      dataset.num_ans_candidates, .5)
        counter = Counter(objects) if use_counter else None
        return BanModel(dataset, params_set, w_emb, q_emb, v_att, b_net, q_prj,
                        c_prj, classifier, counter, op, gamma)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
def build_ban(num_token, v_dim, num_hid, num_ans, op='', gamma=4, reasoning=False):
    w_emb = WordEmbedding(num_token, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    if not reasoning:
        v_att = BiAttention(v_dim, num_hid, num_hid, gamma)
    else:
        v_att = BiAttention(v_dim, num_hid, num_hid, 1)
    # constructing the model
    b_net = []
    q_prj = []
    c_prj = []
    objects = 36  # minimum number of boxes, originally 10
    for i in range(gamma):
        b_net.append(BCNet(v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    classifier = SimpleClassifier(num_hid, num_hid * 2, num_ans, .5)
    counter = Counter(objects)
    if not reasoning:
        return BanModel(w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier,
                        counter, op, gamma, num_hid)
    else:
        return BanModel_Reasoning(w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                                  classifier, counter, op, gamma, num_hid)
def __init__(self, max_role_count, vocab_size, gpu_mode, embed_hidden=300,
             mlp_hidden=512):
    super(TopDown, self).__init__()
    self.vocab_size = vocab_size
    self.max_role_count = max_role_count
    self.gpu_mode = gpu_mode
    # self.q_emb = nn.LSTM(embed_hidden, mlp_hidden,
    #                      batch_first=True, bidirectional=True)
    # self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.q_proj = nn.Sequential(
        nn.Linear(embed_hidden * 2, mlp_hidden),
        nn.ReLU(),
    )
    self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                       self.vocab_size, 0.5)
    self.mlp_hidden = mlp_hidden
    self.dropout = nn.Dropout(0.2)
def build_model_APD(dataset, num_hid, dropout, norm, activation, dropL, dropG,
                    dropW, dropC):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, emb_dim=300, dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                              bidirect=False, dropout=dropG, rnn_type='GRU')
    v_att = Att_PD(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid,
                   dropout=dropout, norm=norm, act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid], dropout=dropL, norm=norm,
                  act=activation)
    v_net = FCNet([dataset.v_dim, num_hid], dropout=dropL, norm=norm,
                  act=activation)
    classifier = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=dropC, norm=norm, act=activation)
    return Model(w_emb, q_emb, v_att, q_net, v_net, classifier)
def __init__(self, encoder, qword_embeddings, vocab_size, gpu_mode, mlp_hidden=512):
    super(ImSituationHandler, self).__init__()
    self.encoder = encoder
    self.qword_embeddings = qword_embeddings
    self.vocab_size = vocab_size
    self.gpu_mode = gpu_mode
    self.role_handler = RoleQHandler()
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                       self.vocab_size, 0.5)
    self.mlp_hidden = mlp_hidden
    # relation-network-style modules: g presumably scores feature pairs,
    # f aggregates the pooled pair representations
    self.g = nn.Sequential(
        nn.Linear(mlp_hidden * 2, mlp_hidden),
        nn.ReLU(),
        nn.Linear(mlp_hidden, mlp_hidden),
        nn.ReLU(),
        nn.Linear(mlp_hidden, mlp_hidden),
        nn.ReLU(),
        nn.Linear(mlp_hidden, mlp_hidden),
        nn.ReLU(),
    )
    self.f = nn.Sequential(
        nn.Linear(mlp_hidden, mlp_hidden),
        nn.ReLU(),
        nn.Linear(mlp_hidden, mlp_hidden),
    )
def __init__(self, encoder, gpu_mode, conv_hidden=24, embed_hidden=300,
             lstm_hidden=300, mlp_hidden=512):
    super(BaseModel, self).__init__()
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.mlp_hidden = mlp_hidden
    # self.vocab_size = self.encoder.get_num_labels()
    self.n_verbs = self.encoder.get_num_verbs()
    self.vocab_size = self.encoder.get_num_labels()
    self.q_word_count = len(self.encoder.question_words)
    self.conv_agent = vgg16_modified()
    self.conv_verb = vgg16_modified_feat()
    self.agent = nn.Sequential(
        nn.Linear(mlp_hidden * 8, mlp_hidden * 2),
        nn.BatchNorm1d(mlp_hidden * 2),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(mlp_hidden * 2, self.vocab_size),
    )
    self.w_emb = nn.Embedding(self.q_word_count + 1, embed_hidden,
                              padding_idx=self.q_word_count)
    self.verb = TopDown()
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                       self.n_verbs, 0.5)
    self.logits = {}
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid], 0.0, "weight", "ReLU")
    v_net = FCNet([dataset.v_dim, num_hid], 0.0, "weight", "ReLU")
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel0(w_emb, q_emb, v_att, q_net, v_net, classifier)
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
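# Hedged usage sketch for build_baseline0_newatt above. The stand-in dataset
# attributes and the forward signature (v, b, q, labels), as in the
# bottom-up-attention VQA baseline, are assumptions about the surrounding code.
import torch

class _QDict:
    ntoken = 20000

class _Data:
    question_dictionary = _QDict()
    v_dim = 2048
    num_ans_candidates = 3129

model = build_baseline0_newatt(_Data(), num_hid=1024)
v = torch.randn(8, 36, 2048)           # 36 region features per image (assumed)
q = torch.randint(0, 20000, (8, 14))   # padded question token ids (assumed)
logits = model(v, None, q, None)       # -> (8, num_ans_candidates)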
def build_caq(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates + 1, 0.5)
    return CAQModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
def build_baseline0_newatt_lstm_bidirection(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding2(300, num_hid, 1, True, 0.0, rnn_type='LSTM')
    v_att = NewAttention2(dataset.v_dim, q_emb.out_hid, num_hid)
    q_net = FCNet([q_emb.out_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
def __init__(self, image_emb_size, qst_emb_size, no_ans):
    super(VQA_Model, self).__init__()
    num_hid = 1024
    # emb_size = image_emb_size + qst_emb_size
    self.img_att = NewAttention(image_emb_size, qst_emb_size, qst_emb_size)
    # self.linear = nn.Linear(emb_size, no_ans)
    # project question and attended image features to a shared hidden size;
    # the original swapped these input dims (q_net took image_emb_size and
    # v_net took qst_emb_size), which only works when the two sizes are equal
    self.q_net = FCNet([qst_emb_size, num_hid])
    self.v_net = FCNet([image_emb_size, num_hid])
    self.classifier = SimpleClassifier(num_hid, num_hid * 2, no_ans, 0.5)
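# Hedged usage sketch for the VQA_Model constructor above; the feature sizes
# and answer count are illustrative assumptions.
model = VQA_Model(image_emb_size=2048, qst_emb_size=1024, no_ans=1000)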