def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att1 = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    v_att2 = Attention(dataset.v_dim, num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return SANModel2(w_emb, q_emb, v_att1, v_att2, q_net, v_net, classifier)
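# The builders in this file all assume a handful of shared helper modules.
# Below is a minimal, hedged sketch of FCNet and SimpleClassifier consistent
# with how they are called here (list-of-dims API, weight-normalized Linear
# layers); the repo's real versions may accept extra norm/act/dropout
# arguments, as several builders below pass them.
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm


class FCNet(nn.Module):
    """Non-linear fully connected network: weight-normed Linear + ReLU per layer."""

    def __init__(self, dims, dropout=0.0):
        super().__init__()
        layers = []
        for in_dim, out_dim in zip(dims[:-1], dims[1:]):
            if dropout > 0:
                layers.append(nn.Dropout(dropout))
            layers.append(weight_norm(nn.Linear(in_dim, out_dim), dim=None))
            layers.append(nn.ReLU())
        self.main = nn.Sequential(*layers)

    def forward(self, x):
        return self.main(x)


class SimpleClassifier(nn.Module):
    """Two-layer head: in_dim -> hid_dim -> out_dim answer logits."""

    def __init__(self, in_dim, hid_dim, out_dim, dropout):
        super().__init__()
        self.main = nn.Sequential(
            weight_norm(nn.Linear(in_dim, hid_dim), dim=None),
            nn.ReLU(),
            nn.Dropout(dropout, inplace=True),
            weight_norm(nn.Linear(hid_dim, out_dim), dim=None),
        )

    def forward(self, x):
        return self.main(x)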
def build_VQE_newatt_2(dataset, num_hid, att_dim, dec_dim):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    generator = STDecoder(dataset.v_dim, num_hid, 300, dec_dim,
                          dataset.explanation_dictionary.ntoken, 1, 0.5)
    return VQAE2(w_emb, q_emb, v_att, q_net, v_net, None, generator, None, None)
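# NewAttention appears in most builders here. A hedged sketch of the usual
# implementation (from the bottom-up/top-down attention line of work): project
# regions and question to num_hid, fuse by elementwise product, and softmax the
# resulting logits over the k regions. Exact signatures are assumptions.
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm


class NewAttention(nn.Module):
    def __init__(self, v_dim, q_dim, num_hid, dropout=0.2):
        super().__init__()
        self.v_proj = FCNet([v_dim, num_hid])  # FCNet as sketched above
        self.q_proj = FCNet([q_dim, num_hid])
        self.dropout = nn.Dropout(dropout)
        self.linear = weight_norm(nn.Linear(num_hid, 1), dim=None)

    def forward(self, v, q):
        """v: [batch, k, v_dim] region features; q: [batch, q_dim] question."""
        batch, k, _ = v.size()
        v_proj = self.v_proj(v)                               # [batch, k, num_hid]
        q_proj = self.q_proj(q).unsqueeze(1).repeat(1, k, 1)  # broadcast over regions
        joint = self.dropout(v_proj * q_proj)
        logits = self.linear(joint)                           # [batch, k, 1]
        return torch.softmax(logits, dim=1)                   # attention weights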
def __init__(self, v_dim, q_dim, num_hid, com=False):
    super().__init__()  # required for nn.Module subclasses; missing in the original
    self.q_net = FCNet([q_dim, num_hid])
    self.v_net = FCNet([v_dim, num_hid])
    self.com = com
    if com:
        layers = [
            nn.Dropout(0.2, inplace=True),
            weight_norm(nn.Linear(num_hid, num_hid), dim=None),
            # nn.ReLU()
        ]
        self.f = nn.Sequential(*layers)
def __init__(self, vocab_size, embed_hidden=300, mlp_hidden=512):
    super(TopDown, self).__init__()
    self.vocab_size = vocab_size
    self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)
    self.mlp_hidden = mlp_hidden
def build_LL_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    # Loss-prediction head (scalar output). Integer division is needed:
    # num_hid / 8 would pass a float as the hidden dimension.
    classifier_LL = SimpleClassifier(num_hid * 4, num_hid // 8, 1, 0.5)
    classifier_All = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return LL4ALModel(w_emb, q_emb, v_att, q_net, v_net, classifier_LL, classifier_All)
def attention_mfh(dataset, num_hid, dropout, norm, activation, drop_L, drop_G,
                  drop_W, drop_C, mfb_out_dim, bidirect_val=False):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, emb_dim=300, dropout=drop_W)
    q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                              bidirect=bidirect_val, dropout=drop_G, rnn_type='GRU')
    v_att = Base_Att(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid,
                     dropout=dropout, bidirect=bidirect_val, norm=norm, act=activation)
    if not bidirect_val:
        q_net = FCNet([num_hid, num_hid], dropout=drop_L, norm=norm, act=activation)
        # v_net = FCNet([dataset.v_dim, num_hid], dropout=drop_L, norm=norm, act=activation)
    else:
        q_net = FCNet([2 * num_hid, num_hid], dropout=drop_L, norm=norm, act=activation)
    v_net = FCNet([dataset.v_dim, num_hid], dropout=drop_L, norm=norm, act=activation)
    mfh_net = mfh_baseline(QUEST_EMBED=num_hid, VIS_EMBED=num_hid, MFB_OUT_DIM=mfb_out_dim)
    classifier = SimpleClassifier(in_dim=2 * mfb_out_dim, hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=drop_C, norm=norm, act=activation)
    return VQA_Model_MFH(w_emb, q_emb, v_att, q_net, v_net, mfh_net, classifier)
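# Why attention_mfh branches on bidirect_val: a bidirectional GRU concatenates
# the forward and backward hidden states, so the question feature width doubles
# from num_hid to 2 * num_hid. A quick, self-contained check:
import torch
import torch.nn as nn

num_hid = 8
gru = nn.GRU(300, num_hid, num_layers=1, bidirectional=True, batch_first=True)
out, _ = gru(torch.randn(2, 5, 300))
print(out.shape)  # torch.Size([2, 5, 16]) -> hence FCNet([2 * num_hid, num_hid])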
def build_BAN(dataset, args, priotize_using_counter=False):
    # init word embedding module, question embedding module, and attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600,
                              args.num_hid, 1, False, .0, args.rnn)
    v_att = BiAttention(dataset.v_dim, args.num_hid, args.num_hid, args.gamma)
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % weight_path)
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s' % weight_path)
        ae_v_emb.load_state_dict(torch.load(weight_path))
    # loading tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # optional counter module for BAN
    # note: with the default priotize_using_counter=False, use_counter is always
    # False; args.use_counter is only consulted when None is passed explicitly
    use_counter = args.use_counter if priotize_using_counter is None else priotize_using_counter
    if use_counter or priotize_using_counter:
        objects = 10  # minimum number of boxes
        counter = Counter(objects)
    else:
        counter = None
    # init BAN residual network
    b_net = []
    q_prj = []
    c_prj = []
    for i in range(args.gamma):
        b_net.append(BCNet(dataset.v_dim, args.num_hid, args.num_hid, None, k=1))
        q_prj.append(FCNet([args.num_hid, args.num_hid], '', .2))
        if use_counter or priotize_using_counter:
            c_prj.append(FCNet([objects + 1, args.num_hid], 'ReLU', .0))
    # init classifier
    classifier = SimpleClassifier(args.num_hid, args.num_hid * 2,
                                  dataset.num_ans_candidates, args)
    # construct VQA model and return
    maml_emb = maml_v_emb if args.maml else None
    ae_emb = ae_v_emb if args.autoencoder else None
    return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                     classifier, counter, args, maml_emb, ae_emb)
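# Hypothetical invocation of build_BAN, assuming an argparse-style namespace
# with the fields the function reads above (values here are illustrative, not
# the repo's actual defaults or CLI):
from types import SimpleNamespace

ban_args = SimpleNamespace(
    op='c', rnn='LSTM', num_hid=1024, gamma=4,  # embedding / glimpse config
    maml=False, autoencoder=False,              # skip the pre-trained branches
    use_counter=False, tfidf=False,
    RAD_dir='data_RAD',                         # only read when maml/autoencoder is set
    eps_cnn=1e-5, momentum_cnn=0.05,
)
# model = build_BAN(dataset, ban_args)  # needs a dataset object and the modules above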
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    # note: the hardcoded 1024s in c_1 implicitly assume num_hid == 1024
    c_1 = MLP(input_dim=1024, dimensions=[1024, 1024, dataset.num_ans_candidates])
    c_2 = nn.Linear(dataset.num_ans_candidates, dataset.num_ans_candidates)
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, c_1, c_2)
def build_baseline1(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    w_emb2 = WordEmbedding(dataset.dictionary.ntoken, num_hid, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    lstm = nn.LSTM(num_hid, num_hid, 1, batch_first=True)
    classifier = SimpleClassifier(num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, lstm, w_emb2)
def build_baseline0(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb1 = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    q_emb2 = QuestionEmbedding(300)  # note: built but never passed to AttentionModel
    v_att = StackAttention(num_hid, num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    v_net = FCNet([num_hid, num_hid])
    linear = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return AttentionModel(w_emb, q_emb1, v_att, q_net, v_net, classifier, linear)
def __init__(self, encoder, gpu_mode, conv_hidden=24, embed_hidden=300,
             lstm_hidden=300, mlp_hidden=512):
    super(BaseModel, self).__init__()
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.n_verbs = self.encoder.get_num_verbs()
    self.vocab_size = self.encoder.get_num_labels()
    # self.agent_label_lookup = nn.Embedding(self.vocab_size, embed_hidden)
    self.conv_agent = vgg16_modified()
    self.conv_verb = vgg16_modified_feat()
    self.q_word_count = len(self.encoder.question_words)
    self.w_emb = nn.Embedding(self.q_word_count, embed_hidden)
    self.word_att = BigAttention(mlp_hidden, embed_hidden, mlp_hidden)
    self.vqa_model = TopDown()
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden, self.n_verbs, 0.5)
def build_vqae_newatt(dataset, num_hid, att_dim, dec_dim):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    generator = STDecoder(dataset.v_dim, num_hid, 300, dec_dim,
                          dataset.explanation_dictionary.ntoken, 1, 0.5)
    return VQAE(w_emb, q_emb, v_att, q_net, v_net, classifier, generator)
def build_baseline4(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = AttQuestionEmbedding(300, 1024, 1, 0, 512, 1, 2, 0)
    v_att = doubel_project_attention(dataset.v_dim, q_emb.num_hid, q_emb.num_hid, 0.2)
    q_net = FCNet([num_hid, num_hid], 0.0, "weight", "ReLU")
    v_net = FCNet([dataset.v_dim, num_hid], 0.0, "weight", "ReLU")
    rn = RN4(dataset.v_dim, num_hid, "weight", "ReLU", 0.0)
    sfu = SFU(dataset.v_dim, dataset.v_dim)
    classifier = SimpleClassifier(num_hid, 5000, dataset.num_ans_candidates, 0.5)
    return BaseModel1(w_emb, q_emb, v_att, q_net, v_net, rn, sfu, classifier)
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    c_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    c_net = FCNet([c_emb.num_hid, num_hid])
    classifier = SimpleClassifier(2 * num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, c_emb, v_att, q_net, v_net, c_net, classifier)
def __init__(self, v_dim, q_dim, num_hid, norm, act, dropout=0.0):
    super(Att_PD, self).__init__()
    norm_layer = get_norm(norm)
    self.nonlinear = FCNet([v_dim + q_dim, num_hid, num_hid],
                           dropout=dropout, norm=norm, act=act)
    self.nonlinear_gate = FCNet([v_dim + q_dim, num_hid, num_hid],
                                dropout=dropout, norm=norm, act='Sigmoid')
    self.linear = norm_layer(nn.Linear(num_hid, 1), dim=None)
def __init__(self, v_dim, q_dim, num_hid, norm, act, dropout=0.0):
    super(Att_2, self).__init__()
    norm_layer = get_norm(norm)
    self.v_proj = FCNet([v_dim, num_hid], dropout=dropout, norm=norm, act=act)
    self.q_proj = FCNet([q_dim, num_hid], dropout=dropout, norm=norm, act=act)
    # the fused v_proj * q_proj feature has width num_hid, so the logit layer
    # must take num_hid (the original's nn.Linear(q_dim, 1) only works when
    # q_dim happens to equal num_hid)
    self.linear = norm_layer(nn.Linear(num_hid, 1), dim=None)
def attention_baseline(dataset, num_hid, dropout, norm, activation, drop_L,
                       drop_G, drop_W, drop_C, bidirect_val=False):
    print('Here in the attention baseline')
    w_emb = WordEmbedding(dataset.dictionary.ntoken, emb_dim=300, dropout=drop_W)
    q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                              bidirect=bidirect_val, dropout=drop_G, rnn_type='GRU')
    # bert_emb = BertEmbedding(in_dim=7168, num_hid=num_hid)
    v_att = Base_Att(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid,
                     dropout=dropout, bidirect=bidirect_val, norm=norm, act=activation)
    if not bidirect_val:
        q_net = FCNet([num_hid, num_hid], dropout=drop_L, norm=norm, act=activation)
        # v_net = FCNet([dataset.v_dim, num_hid], dropout=drop_L, norm=norm, act=activation)
    else:
        q_net = FCNet([2 * num_hid, num_hid], dropout=drop_L, norm=norm, act=activation)
    v_net = FCNet([dataset.v_dim, num_hid], dropout=drop_L, norm=norm, act=activation)
    classifier = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=drop_C, norm=norm, act=activation)
    return VQA_Model(w_emb, q_emb, v_att, q_net, v_net, classifier)
def __init__(self, embed_hidden=300, mlp_hidden=512):
    super(TopDown, self).__init__()
    self.q_emb = nn.LSTM(embed_hidden + mlp_hidden, mlp_hidden,
                         batch_first=True, bidirectional=True)
    self.q_prep = FCNet([mlp_hidden, mlp_hidden])
    self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
    self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
def build_multimodal_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    # vision-only head; 2048 appears to hardcode the image feature width (dataset.v_dim)
    classifier_V = SimpleClassifier(2048, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_Q = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_All = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return MultiModalModel(w_emb, q_emb, v_att, q_net, v_net,
                           classifier_V, classifier_Q, classifier_All)
def build_baseline(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att1 = Att_3(dataset.v_dim, num_hid, num_hid, "weight", "ReLU")
    v_att2 = Att_3(dataset.v_dim, num_hid, num_hid, "weight", "ReLU")
    q_net = FCNet([num_hid, num_hid], 0.0, "weight", "ReLU")
    v_net = FCNet([dataset.v_dim, num_hid], 0.0, "weight", "ReLU")
    rn = RN(dataset.v_dim, "weight", "ReLU", 0.0)
    sfu = SFU(dataset.v_dim, dataset.v_dim)
    classifier = SimpleClassifier(num_hid, 5000, dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att1, v_att2, q_net, v_net, rn, sfu, classifier)
def __init__(self, vocab_size, embed_hidden=300, mlp_hidden=512):
    super(TopDown, self).__init__()
    self.vocab_size = vocab_size
    self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
def __init__(self, image_feat_dim, txt_rnn_embeding_dim, hidden_size, dropout=0.2):
    super(project_attention, self).__init__()
    self.image_feat_dim = image_feat_dim
    self.txt_embeding_dim = txt_rnn_embeding_dim
    self.Fa_image = FCNet([image_feat_dim, hidden_size], 0.0, "weight", "LeakyReLU")
    self.Fa_txt = FCNet([txt_rnn_embeding_dim, hidden_size], 0.0, "weight", "LeakyReLU")
    self.dropout = nn.Dropout(dropout)
    self.lc = nn.Linear(hidden_size, 1)
def build_caq_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid // 2])
    v_net = FCNet([dataset.v_dim, num_hid // 2])
    updated_query_composer = FCNet([num_hid + num_hid // 2, num_hid])
    neighbour_attention = MultiHeadedAttention(4, num_hid // 2, dropout=0.1)
    Dropout_C = nn.Dropout(0.1)
    classifier = SimpleClassifier(num_hid // 2, num_hid * 2,
                                  dataset.num_ans_candidates + 1, 0.5)
    return CAQModel(w_emb, q_emb, v_att, q_net, v_net, updated_query_composer,
                    neighbour_attention, Dropout_C, classifier, dataset)
def build_CCB_model(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_ct_net = FCNet([q_emb.num_hid, num_hid])
    q_cx_net = FCNet([q_emb.num_hid, num_hid])
    v_ct_net = FCNet([dataset.v_dim, num_hid])
    v_cx_net = FCNet([dataset.v_dim, num_hid])
    classifier_fq = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    classifier_vq = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    # note: v_cx_net is passed after the classifiers; CCB_Model's constructor
    # must expect this argument order
    return CCB_Model(w_emb, q_emb, v_att, q_ct_net, q_cx_net, v_ct_net,
                     classifier_fq, classifier_vq, v_cx_net)
def __init__(self, v_dim, q_dim, num_hid, output_channel=36, kernel_size=1, stride=1,
             instance_norm=0, padding_type='same', l2_norm=0, concat=1, leaky_relu=None,
             last_no_relu=None, num_conv_layer=1, conv_norm=0, softmax=0, dropout=0.2):
    super(SigSoftAttention, self).__init__()
    self.output_channel = output_channel
    self.kernel_size = kernel_size
    self.stride = stride
    self.padding_type = padding_type
    self.l2_norm = l2_norm
    self.concat = concat
    self.softmax = softmax
    self.num_conv_layer = num_conv_layer
    if self.num_conv_layer == 2:
        conv_hid_dim = 256
    self.v_proj = FCNet([v_dim, num_hid], leaky_relu, last_no_relu)
    self.q_proj = FCNet([q_dim, num_hid], leaky_relu, last_no_relu)
    self.dropout = nn.Dropout(dropout)
    assert stride == 1
    if padding_type == 'same':
        # integer division: Conv2d padding must be an int, and (k - 1) // 2
        # preserves the spatial size for odd kernels at stride 1
        padding_num = (self.kernel_size - 1) // 2
    elif padding_type == 'valid':
        padding_num = 0
        self.zero_padding = nn.ConstantPad2d((self.kernel_size - 1) // 2, -1)  # there is a (x+1)/2 operation later
    else:
        raise ValueError
    # the concat / no-concat branches differ only in the first conv's input width
    in_channels = 2 * num_hid if self.concat else num_hid
    if self.num_conv_layer == 2:
        self.conv2 = conv_weight_norm(
            nn.Conv2d(in_channels, conv_hid_dim, self.kernel_size, self.stride,
                      padding=padding_num, bias=True),
            conv_norm=conv_norm)
        self.conv2_relu = nn.LeakyReLU(negative_slope=0.3)
        self.conv1 = conv_weight_norm(
            nn.Conv2d(conv_hid_dim, self.output_channel, self.kernel_size, self.stride,
                      padding=padding_num, bias=True),
            conv_norm=conv_norm)
    else:
        self.conv1 = conv_weight_norm(
            nn.Conv2d(in_channels, self.output_channel, self.kernel_size, self.stride,
                      padding=padding_num, bias=True),
            conv_norm=conv_norm)
    self.sigmoid = nn.Sigmoid()
    self.instance_norm = instance_norm
    if self.instance_norm:
        if self.num_conv_layer == 2:
            self.conv2_in = nn.InstanceNorm2d(conv_hid_dim)
        self.conv1_in = nn.InstanceNorm2d(self.output_channel)
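# Sanity check for the 'same' padding arithmetic used in SigSoftAttention:
# with stride 1 and an odd kernel, padding = (k - 1) // 2 preserves the
# spatial size (true 'same' padding for even kernels would need asymmetric pads).
import torch
import torch.nn as nn

k = 3
conv = nn.Conv2d(8, 8, kernel_size=k, stride=1, padding=(k - 1) // 2)
x = torch.randn(1, 8, 6, 6)
assert conv(x).shape == x.shape  # spatial dims unchanged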
def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
    super(BaseModel, self).__init__()
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.n_roles = self.encoder.get_num_roles()
    self.n_verbs = self.encoder.get_num_verbs()
    self.vocab_size = self.encoder.get_num_labels()
    self.max_role_count = self.encoder.get_max_role_count()
    self.n_role_q_vocab = len(self.encoder.question_words)
    self.conv = vgg16_modified()
    self.verb_lookup = nn.Embedding(self.n_verbs, embed_hidden)
    self.w_emb = nn.Embedding(self.n_role_q_vocab + 1, embed_hidden,
                              padding_idx=self.n_role_q_vocab)
    self.q_emb = nn.LSTM(embed_hidden, mlp_hidden, batch_first=True, bidirectional=True)
    self.q_prep = FCNet([mlp_hidden, mlp_hidden])
    self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
    # self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden, self.vocab_size, 0.5)
    self.conv_hidden = self.conv.base_size()
    self.mlp_hidden = mlp_hidden
    self.embed_hidden = embed_hidden
def build_stackatt(dataset, num_hid, args):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, args.rnn_layer, False, 0.4)
    v_att = NewAttention(dataset.v_dim, 2048 + q_emb.num_hid, num_hid, 0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    query_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    model = BaseModelStackAtt(w_emb, q_emb, v_att, q_net, v_net, query_net, classifier, args)
    return model
def __init__(self, c_dim, num_hid, q_dim, nlayers, bidirect, dropout,
             rnn_type='LSTM', v_dim=2048):
    """Caption/question/image embedding module."""
    super(CaptionQuestionImageRNN, self).__init__()
    assert rnn_type == 'LSTM' or rnn_type == 'GRU'
    rnn_cls = nn.LSTM if rnn_type == 'LSTM' else nn.GRU
    norm_layer = get_norm('weight')
    self.rnn_att = rnn_cls(c_dim, num_hid, nlayers, bidirectional=bidirect,
                           dropout=dropout, batch_first=True)
    self.rnn_c = rnn_cls(c_dim, num_hid, nlayers, bidirectional=bidirect,
                         dropout=dropout, batch_first=True)
    self.q_emb_for_c = FCNet([q_dim, num_hid], dropout=0.2, norm='weight', act='LeakyReLU')
    self.att_logits = norm_layer(nn.Linear(num_hid, 1), dim=None)
    self.v_emb_for_c = FCNet([v_dim, num_hid], dropout=0.2, norm='weight', act='LeakyReLU')
    self.v_att_logits = norm_layer(nn.Linear(num_hid, 1), dim=None)
    self.Sig = nn.Sigmoid()
    self.c_dim = c_dim
    self.q_dim = q_dim
    self.num_hid = num_hid
    self.nlayers = nlayers
    self.ndirections = int(bidirect) + 1
    self.rnn_type = rnn_type
    self.v_dim = v_dim
def build_lstm_vqa(dataset, num_hid, att_dim, dec_dim):
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    generator = SATDecoder(dataset.v_dim, num_hid, 300, att_dim, dec_dim,
                           dataset.explanation_dictionary.ntoken, 1, 0.5)
    # att_emb = nn.GRU(dataset.v_dim, num_hid, 1, False, batch_first=True)
    att_emb = nn.GRUCell(dataset.v_dim, num_hid)
    classifier = SimpleClassifier(num_hid, 2 * num_hid, dataset.num_ans_candidates, 0.5)
    return LSTM_VQA(w_emb, q_emb, v_att, q_net, v_net, generator, att_emb, classifier)
def build_baseline0_gcn(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding_all(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net0 = FCNet([q_emb.num_hid, num_hid])
    v_net0 = FCNet([dataset.v_dim, num_hid])
    gcn = FCNet([num_hid, num_hid])
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, 0.5)
    return GraphModel(w_emb, q_emb, v_att, q_net0, v_net0, gcn, q_net, v_net, classifier)
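# Nearly every builder above follows the same fusion pattern: attention-pooled
# image features and the final question state are each projected to num_hid by
# v_net / q_net, fused by elementwise product, and fed to the classifier. A
# shape-level sketch with dummy tensors (all dimensions illustrative):
import torch

batch, k, v_dim, num_hid = 2, 36, 2048, 1024
v = torch.randn(batch, k, v_dim)                      # region features
att = torch.softmax(torch.randn(batch, k, 1), dim=1)  # v_att output
v_emb = (att * v).sum(1)                              # pooled image feature [batch, v_dim]
q_repr = torch.randn(batch, num_hid)                  # stands in for q_net(q_emb)
v_repr = torch.randn(batch, num_hid)                  # stands in for v_net(v_emb)
joint = q_repr * v_repr                               # classifier input
print(joint.shape)                                    # torch.Size([2, 1024])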