def build_ban_foil(dataset, num_hid, num_ans_candidates, op='', gamma=4):
    """Assemble a BAN model for the FOIL task.

    Wires word/question embeddings, bi-attention, ``gamma`` glimpse-wise
    bilinear + projection nets, a box counter, and a FOIL classifier into
    a ``BanModel``.
    """
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    # 'c' in op means concatenated word embeddings (300 -> 600 input dims).
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)

    objects = 10  # minimum number of boxes
    b_net, q_prj, c_prj = [], [], []
    for _ in range(gamma):
        b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))

    classifier = SimpleClassifierFoil(num_hid, 64, num_ans_candidates)
    counter = Counter(objects)
    return BanModel(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                    classifier, counter, op, gamma)
def __init__(self, vocab_size, embed_hidden=300, mlp_hidden=512):
    """Top-down attention module over a bidirectional-LSTM question encoding.

    Args:
        vocab_size: size of the output label vocabulary.
        embed_hidden: word/verb embedding width fed to the LSTM.
        mlp_hidden: hidden width shared by the FC nets and attention.
    """
    super(TopDown, self).__init__()
    self.vocab_size = vocab_size
    # Bidirectional encoder doubles the LSTM output width; lstm_proj folds
    # it back to mlp_hidden.
    self.q_emb = nn.LSTM(embed_hidden, mlp_hidden,
                         batch_first=True, bidirectional=True)
    self.q_prep = FCNet([mlp_hidden, mlp_hidden])
    self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
    self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                       self.vocab_size, 0.5)
def __init__(self, opt):
    """Up-Down VQA model configured entirely from ``opt``.

    Reads dropout/activation/normalization hyper-parameters from ``opt``,
    builds GloVe-initialized word embeddings, a question encoder, two
    visual attention heads and a 3129-way answer classifier.
    """
    super(UpDn, self).__init__()
    num_hid = opt.num_hid
    activation = opt.activation
    dropG = opt.dropG
    dropW = opt.dropW
    dropout = opt.dropout
    dropL = opt.dropL
    norm = opt.norm
    dropC = opt.dropC
    self.opt = opt

    print(f"ntokens {opt.ntokens}")
    self.w_emb = WordEmbedding(opt.ntokens, emb_dim=300, dropout=dropW)
    # Initialize from pre-trained GloVe vectors shipped with the dataset.
    self.w_emb.init_embedding(f'{opt.data_dir}/glove6b_init_300d.npy')
    self.q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid)

    self.q_net = FCNet([self.q_emb.num_hid, num_hid],
                       dropout=dropL, norm=norm, act=activation)
    self.gv_net = FCNet([2048, num_hid],
                        dropout=dropL, norm=norm, act=activation)
    self.gv_att_1 = Att_3(v_dim=2048, q_dim=self.q_emb.num_hid,
                          num_hid=num_hid, dropout=dropout,
                          norm=norm, act=activation)
    self.gv_att_2 = Att_3(v_dim=2048, q_dim=self.q_emb.num_hid,
                          num_hid=num_hid, dropout=dropout,
                          norm=norm, act=activation)
    # 3129 = standard VQA v2 answer-candidate count.
    self.classifier = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid,
                                       out_dim=3129, dropout=dropC,
                                       norm=norm, act=activation)
def build_stackdualatt(dataset, num_hid, args):
    """Build a stacked dual-attention VQA model.

    The second and third attention stages consume the question features
    concatenated with the previous stage output (hence ``+ num_hid``).
    """
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, args.rnn_layer, False, 0.4)
    v_att1 = DualAttention(dataset.v_dim, q_emb.num_hid, num_hid, 0.2)
    v_att2 = DualAttention(dataset.v_dim, q_emb.num_hid + num_hid, num_hid, 0.2)
    v_att3 = DualAttention(dataset.v_dim, q_emb.num_hid + num_hid, num_hid, 0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    query_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModelStackAtt(w_emb, q_emb, (v_att1, v_att2, v_att3),
                             q_net, v_net, query_net, classifier, args)
def build_baseline0_newatt(dataset, num_hid, reconstruction, size=64,
                           dropout_hid=0.0, gamma_r=0.0, adv_mode="wgan",
                           logger=None):
    """Build the new-attention baseline with an optional reconstruction head.

    All reconstruction/adversarial options are forwarded untouched to
    ``BaseModel``.
    """
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier,
                     num_hid, dataset.v_dim, reconstruction, size,
                     dropout_hid, gamma_r, adv_mode, logger)
def build_model_A3x2_h(dataset, num_hid, dropout, norm, activation,
                       dropL, dropG, dropW, dropC):
    """Build the A3x2 variant with an extra hyper-network (``h_net``) head."""
    w_emb = WordEmbedding(dataset.dictionary.ntoken, emb_dim=300, dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                              bidirect=False, dropout=dropG, rnn_type='GRU')
    print('v_dim: %d\tq_dim: %d\tnum_hid: %d\t num ans candidates: %d'
          % (dataset.v_dim, q_emb.num_hid, num_hid, dataset.num_ans_candidates))
    v_att_1 = Att_3(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid,
                    dropout=dropout, norm=norm, act=activation)
    v_att_2 = Att_3(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid,
                    dropout=dropout, norm=norm, act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid],
                  dropout=dropL, norm=norm, act=activation)
    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=dropL, norm=norm, act=activation)
    # Hyper-net dims are fixed by the upstream architecture.
    h_net = HNet([1280, 100, 100], [1280, 1280])
    classifier = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=dropC, norm=norm, act=activation)
    return Model_h(w_emb, q_emb, v_att_1, v_att_2, q_net, v_net,
                   h_net, classifier)
def __init__(self, dim, max_step=12, self_attention=False, memory_gate=False,
             classes=28, dropout=0.15):
    """MAC-style reasoning network over question/visual features.

    NOTE: attribute names ``q_trasform``/``v_trasform`` (sic) are kept as-is
    because external code references them.
    """
    super().__init__()
    self.q_trasform = FCNet([300, dim])
    self.v_trasform = FCNet([dim, dim])
    self.mac = MACUnit(dim, max_step, self_attention, memory_gate, dropout)
    # Previous classifier (kept for reference):
    #   nn.Sequential(linear(dim * 2, dim), nn.ELU(), nn.Dropout(0.5),
    #                 linear(dim, classes))
    self.classifier = SimpleClassifier(dim, 2 * dim, classes, 0.5)
    self.max_step = max_step
    self.dim = dim
def build_baseline0_newatt2(args, num_hid):
    """Build the dialog-style baseline with question *and* history streams.

    Every width is doubled (``*2``) because the question/history encoders
    are bidirectional. Module creation order is preserved deliberately so
    seeded initialization matches the original.
    """
    w_emb = WordEmbedding(args.vocab_size, args.ninp, 0.0)
    q_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)
    h_emb = QuestionEmbedding2(args.ninp, num_hid, args.nlayers, True, 0.0)

    v_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    h_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    qih_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    qhi_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)

    q_net = FCNet([q_emb.num_hid * 2, num_hid * 2])
    v_net = FCNet([args.nhid * 2, num_hid * 2])
    h_net = FCNet([args.nhid * 2, num_hid * 2])
    qih_net = FCNet([args.nhid * 2, num_hid * 2])
    qhi_net = FCNet([args.nhid * 2, num_hid * 2])

    qhih_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    qihi_att = NewAttention(args.nhid * 2, q_emb.num_hid * 2, num_hid * 2)
    decoder = netG(args)
    return BaseModel2(w_emb, q_emb, h_emb, v_att, h_att, q_net, v_net, h_net,
                      qih_att, qhi_att, qih_net, qhi_net, decoder, args,
                      qhih_att, qihi_att)
def build_ban(dataset, num_hid, op='', gamma=4, q_emb_type='bert',
              on_do_q=False, finetune_q=False):
    """Build a BAN model with a selectable question encoder.

    Args:
        dataset: dataset object providing ``dictionary``, ``v_dim`` and
            ``num_ans_candidates``.
        num_hid: hidden width of the attention/projection nets.
        op: word-embedding options string forwarded to the RNN encoders.
        gamma: number of bi-attention glimpses.
        q_emb_type: one of 'bert', 'bertrnn', 'rg', 'pkb' (substring match).
        on_do_q: keep dropout inside the question encoder when True.
        finetune_q: leave question-embedding weights trainable when True.

    Raises:
        ValueError: if ``q_emb_type`` matches none of the known encoders.
            (Previously an unknown type left ``q_emb``/``q_dim`` undefined
            and crashed later with a NameError.)
    """
    if 'bert' in q_emb_type:
        q_emb = BertModel.from_pretrained('bert-base-multilingual-cased')
        q_dim = 768  # BERT-base hidden size
    elif 'rg' in q_emb_type:
        w_dim = 100
        q_dim = num_hid
        q_emb = RnnQuestionEmbedding(dataset.dictionary.ntoken, w_dim, q_dim, op)
    elif 'pkb' in q_emb_type:
        w_dim = 200
        q_dim = num_hid
        q_emb = RnnQuestionEmbedding(dataset.dictionary.ntoken, w_dim, q_dim, op)
    else:
        raise ValueError('unknown q_emb_type: %r' % (q_emb_type,))

    if 'bertrnn' in q_emb_type:
        # Wrap BERT with an RNN head; output width becomes num_hid.
        q_emb = BertRnnQuestionEmbedding(q_emb, 200, num_hid, op)
        q_dim = num_hid

    if not finetune_q:  # Freeze question embedding
        if isinstance(q_emb, BertModel):
            for p in q_emb.parameters():
                p.requires_grad_(False)
        else:
            for p in q_emb.w_emb.parameters():
                p.requires_grad_(False)

    if not on_do_q:  # Remove dropout of question embedding
        for m in q_emb.modules():
            if isinstance(m, nn.Dropout):
                m.p = 0.

    v_att = BiAttention(dataset.v_dim, q_dim, num_hid, gamma)
    b_net = []
    q_prj = []
    c_prj = []
    objects = 10  # minimum number of boxes
    for _ in range(gamma):
        b_net.append(BCNet(dataset.v_dim, q_dim, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, q_dim], '', .2))
        c_prj.append(FCNet([objects + 1, q_dim], 'ReLU', .0))
    # Two heads: answer classification and a scalar auxiliary score.
    classifiers = [SimpleClassifier(q_dim, num_hid * 2,
                                    dataset.num_ans_candidates, .5),
                   SimpleClassifier(q_dim, num_hid * 2, 1, .5)]
    counter = Counter(objects)
    return BanModel(dataset, q_emb, v_att, b_net, q_prj, c_prj,
                    classifiers, counter, op, gamma)
def build_baseline2(dataset, num_hid):
    """Build baseline2: attention model plus an LSTM over projected visuals."""
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([num_hid, num_hid])
    # Visual features are projected to 300-d, batch-normed, then run
    # through a single-layer LSTM inside BaseModel.
    v_net = nn.Linear(dataset.v_dim, 300)
    v_bn = nn.BatchNorm1d(300, momentum=0.01)
    lstm = nn.LSTM(300, num_hid, 1, batch_first=True)
    classifier = SimpleClassifier(num_hid, 2 * num_hid,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier, lstm, v_bn)
def __init__(self, embed_hidden=300, mlp_hidden=512):
    """Top-down attention encoder (no classifier head in this variant)."""
    super(TopDown, self).__init__()
    # Bidirectional LSTM doubles the output width; lstm_proj maps it back.
    self.q_emb = nn.LSTM(embed_hidden, mlp_hidden,
                         batch_first=True, bidirectional=True)
    self.q_prep = FCNet([mlp_hidden, mlp_hidden])
    self.lstm_proj = nn.Linear(mlp_hidden * 2, mlp_hidden)
    self.verb_transform = nn.Linear(embed_hidden, mlp_hidden)
    self.v_att = Attention(mlp_hidden, mlp_hidden, mlp_hidden)
def __init__(self, encoder, gpu_mode, embed_hidden=300, mlp_hidden=512):
    """End-to-end situation-recognition network.

    Args:
        encoder: dataset encoder exposing role/verb/label vocabulary sizes.
        gpu_mode: flag forwarded to training code.
        embed_hidden: role/verb embedding width.
        mlp_hidden: hidden width of the conv/MLP trunks.
    """
    super(E2ENetwork, self).__init__()
    # Standard ImageNet normalization.
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])

    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.n_roles = self.encoder.get_num_roles()
    self.n_verbs = self.encoder.get_num_verbs()
    self.vocab_size = self.encoder.get_num_labels()
    self.max_role_count = self.encoder.get_max_role_count()

    self.conv = vgg16_modified()
    # Verb classifier over pooled conv features.
    self.verb = nn.Sequential(
        linear(mlp_hidden * 8, mlp_hidden * 2),
        nn.BatchNorm1d(mlp_hidden * 2),
        nn.ReLU(),
        nn.Dropout(0.5),
        linear(mlp_hidden * 2, self.n_verbs),
    )
    self.frcnn_lower = FCNet([mlp_hidden * 4, mlp_hidden])
    # TODO: init embedding
    # Extra padding row at index n_roles for absent roles.
    self.role_lookup = nn.Embedding(self.n_roles + 1, embed_hidden,
                                    padding_idx=self.n_roles)
    self.verb_lookup = nn.Embedding(self.n_verbs, embed_hidden)
    self.role_labeller = MACNetwork(mlp_hidden, max_step=4,
                                    self_attention=False, memory_gate=False,
                                    classes=self.vocab_size)
    self.conv_hidden = self.conv.base_size()
    self.mlp_hidden = mlp_hidden
    self.embed_hidden = embed_hidden
def __init__(self, h, d_model, dropout=0.1):
    """Multi-headed attention: ``h`` heads over a ``d_model``-wide model.

    Requires ``d_model`` to be divisible by ``h``; per-head width is
    ``d_model // h`` (we assume d_v always equals d_k).
    """
    super(MultiHeadedAttention, self).__init__()
    assert d_model % h == 0
    self.d_k = d_model // h
    self.h = h
    # Three projections: query, key, value.
    self.linears = clones(nn.Linear(d_model, d_model), 3)
    self.attn = None  # filled with the last attention map at forward time
    self.dropout = nn.Dropout(p=dropout)
    self.rep = FCNet([d_model, d_model])
def __init__(self, v_dim, q_dim, h_dim, h_out, act='ReLU',
             dropout=(.2, .5), k=3):
    """Bilinear connect network (BAN low-rank bilinear pooling).

    Args:
        v_dim: visual feature width.
        q_dim: question feature width.
        h_dim: joint hidden width (expanded by ``k`` internally).
        h_out: number of output glimpses; ``None`` disables the output map.
        act: activation name passed to the FC nets.
        dropout: (input dropout, joint dropout). Changed from a mutable
            list default to a tuple — same values, no shared-state hazard;
            callers passing lists are unaffected (only indexed here).
        k: rank-expansion factor; ``k > 1`` adds average pooling.
    """
    super(BCNet, self).__init__()
    self.c = 32  # threshold between parameter-tensor and linear output maps
    self.k = k
    self.v_dim = v_dim
    self.q_dim = q_dim
    self.h_dim = h_dim
    self.h_out = h_out

    self.v_net = FCNet([v_dim, h_dim * self.k], act=act, dropout=dropout[0])
    self.q_net = FCNet([q_dim, h_dim * self.k], act=act, dropout=dropout[0])
    self.dropout = nn.Dropout(dropout[1])  # attention
    if k > 1:
        self.p_net = nn.AvgPool1d(self.k, stride=self.k)

    if h_out is None:
        pass  # no output transform; caller uses the joint features directly
    elif h_out <= self.c:
        # Small glimpse counts: explicit low-rank parameter tensors.
        self.h_mat = nn.Parameter(
            torch.Tensor(1, h_out, 1, h_dim * self.k).normal_())
        self.h_bias = nn.Parameter(torch.Tensor(1, h_out, 1, 1).normal_())
    else:
        self.h_net = weight_norm(nn.Linear(h_dim * self.k, h_out), dim=None)
def __init__(self, v_dim, q_dim, num_hid, norm, act, bidirect=False,
             dropout=0.0):
    """Base attention scorer over concatenated visual+question features.

    A bidirectional question encoder doubles ``q_dim`` in the input width.
    NOTE(review): the check is an identity test against ``False`` (not a
    truthiness test), so e.g. ``bidirect=0`` takes the bidirectional
    branch — preserved as-is.
    """
    super(Base_Att, self).__init__()
    norm_layer = get_norm(norm)
    if bidirect is False:
        self.nonlinear = FCNet([v_dim + q_dim, num_hid],
                               dropout=dropout, norm=norm, act=act)
    else:
        self.nonlinear = FCNet([v_dim + 2 * q_dim, num_hid],
                               dropout=dropout, norm=norm, act=act)
    self.linear = norm_layer(nn.Linear(num_hid, 1), dim=None)
def build_model_APx2(dataset, num_hid, dropout, norm, activation,
                     dropL, dropG, dropW, dropC):
    """Build the APx2 variant: two Att_P heads feeding Model_2."""
    w_emb = WordEmbedding(dataset.dictionary.ntoken, emb_dim=300, dropout=dropW)
    q_emb = QuestionEmbedding(in_dim=300, num_hid=num_hid, nlayers=1,
                              bidirect=False, dropout=dropG, rnn_type='GRU')
    v_att_1 = Att_P(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid,
                    dropout=dropout, norm=norm, act=activation)
    v_att_2 = Att_P(v_dim=dataset.v_dim, q_dim=q_emb.num_hid, num_hid=num_hid,
                    dropout=dropout, norm=norm, act=activation)
    q_net = FCNet([q_emb.num_hid, num_hid],
                  dropout=dropL, norm=norm, act=activation)
    v_net = FCNet([dataset.v_dim, num_hid],
                  dropout=dropL, norm=norm, act=activation)
    classifier = SimpleClassifier(in_dim=num_hid, hid_dim=2 * num_hid,
                                  out_dim=dataset.num_ans_candidates,
                                  dropout=dropC, norm=norm, act=activation)
    return Model_2(w_emb, q_emb, v_att_1, v_att_2, q_net, v_net, classifier)
def build_fine(dataset, num_hid, args):
    """Build a VQA model with a partially fine-tuned ResNet backbone.

    Loads pretrained weights for ``args.cnn_model``, freezes the whole CNN,
    then unfreezes only ``layer4`` for fine-tuning.
    """
    cnn = getattr(resnet, args.cnn_model)()
    cnn.load_state_dict(
        torch.load(os.path.join(args.model_root, args.cnn_model + '.pth')))
    my_cnn = myResnet(cnn)
    # Freeze everything, then re-enable gradients for the last block only.
    for param in my_cnn.parameters():
        param.requires_grad = False
    for param in my_cnn.resnet.layer4.parameters():
        param.requires_grad = True

    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.4)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.4)
    v_att = DualAttention(dataset.v_dim, q_emb.num_hid, num_hid, 0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModelWithCNN(w_emb, q_emb, v_att, q_net, v_net,
                            classifier, my_cnn, args)
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    """Build a BAN model for either VQA answering or Flickr grounding.

    Args:
        dataset: dataset providing ``dictionary``, ``v_dim`` and (for VQA)
            ``num_ans_candidates``.
        num_hid: hidden width.
        op: word-embedding options string.
        gamma: number of glimpses.
        task: 'vqa' or 'flickr'.

    Raises:
        ValueError: for an unknown ``task`` (previously the function
            silently returned ``None``, crashing callers later).
    """
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    # 'c' in op means concatenated word embeddings (300 -> 600 input dims).
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1,
                              False, .0)
    v_att = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    q_net = FCNet([q_emb.num_hid, num_hid], 'Sigmoid')
    v_net = FCNet([dataset.v_dim, num_hid])
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for _ in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2,
                                      dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                        q_net, v_net, classifier, counter, op, gamma)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
    else:
        raise ValueError('unknown task: %r' % (task,))
def __init__(self, v_dim, q_dim, a_dim, h_dim, h_out, rank, glimpse,
             act='ReLU', dropout=(.2, .5), k=1):
    """Tucker-decomposition network fusing visual, question and answer features.

    Args:
        v_dim, q_dim, a_dim: input widths of the three modalities.
        h_dim: joint hidden width (scaled by ``k``).
        h_out: output glimpse count; when > 1 it is split across ``rank``.
        rank: number of low-rank factors.
        glimpse: glimpse dimension of the core tensor ``T_g``.
        act: activation name for all FC nets.
        dropout: (question/answer dropout, visual/joint dropout). Changed
            from a mutable list default to a tuple — same values, no
            shared-state hazard; list arguments still work (only indexed).
        k: rank-expansion factor for ``h_dim``.

    Fix: the original constructed ``self.a_tucker`` twice with identical
    arguments (an unconditional build followed by an ``if self.h_dim <
    1024`` rebuild), discarding the first module. The redundant duplicate
    is removed; the resulting module graph is identical.
    """
    super(TCNet, self).__init__()
    self.v_dim = v_dim
    self.q_dim = q_dim
    self.a_dim = a_dim
    self.h_out = h_out
    self.rank = rank
    self.h_dim = h_dim * k
    self.hv_dim = int(h_dim / rank)
    self.hq_dim = int(h_dim / rank)
    self.ha_dim = int(h_dim / rank)

    self.v_tucker = FCNet([v_dim, self.h_dim], act=act, dropout=dropout[1])
    self.q_tucker = FCNet([q_dim, self.h_dim], act=act, dropout=dropout[0])
    self.a_tucker = FCNet([a_dim, self.h_dim], act=act, dropout=dropout[0])

    # One low-rank projection per factor and per modality.
    self.v_net = nn.ModuleList([
        FCNet([self.h_dim, self.hv_dim], act=act, dropout=dropout[1])
        for _ in range(rank)
    ])
    self.q_net = nn.ModuleList([
        FCNet([self.h_dim, self.hq_dim], act=act, dropout=dropout[0])
        for _ in range(rank)
    ])
    self.a_net = nn.ModuleList([
        FCNet([self.h_dim, self.ha_dim], act=act, dropout=dropout[0])
        for _ in range(rank)
    ])

    if h_out > 1:
        # Split the output glimpses across the rank factors.
        self.ho_dim = int(h_out / rank)
        h_out = self.ho_dim
    self.T_g = nn.Parameter(
        torch.Tensor(1, rank, self.hv_dim, self.hq_dim, self.ha_dim,
                     glimpse, h_out).normal_())
    self.dropout = nn.Dropout(dropout[1])
def build_baseline0_newatt(dataset, num_hid, reconstruction, layer=4,
                           size=64, variant='', finetune=False,
                           use_residual=False, use_feat_loss=False,
                           dropout_hid=False, dropout_unet=False,
                           logger=None):
    """Build the new-attention baseline (U-Net reconstruction variant).

    All reconstruction-related flags are forwarded untouched to
    ``BaseModel``.
    """
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier,
                     num_hid, dataset.v_dim, reconstruction, layer, size,
                     variant, finetune, use_residual, use_feat_loss,
                     dropout_hid, dropout_unet, logger)
def __init__(self, encoder, qword_embeddings, label_embedding, vocab_size,
             gpu_mode, mlp_hidden=512, embed_hidden=300):
    """Situation handler combining image/question/label sub-handlers.

    The label embedding is shared between the image-label and
    question-label handlers.
    """
    super(ImSituationHandler, self).__init__()
    self.encoder = encoder
    self.qword_embeddings = qword_embeddings
    self.label_embedding = label_embedding
    self.vocab_size = vocab_size
    self.gpu_mode = gpu_mode

    self.img_q_handler = RoleQHandler()
    self.img_label_handler = LabelHandler(self.label_embedding)
    self.q_label_handler = Role2LabelHandler(self.label_embedding)

    # Three handler outputs are concatenated before c_net.
    self.c_net = FCNet([mlp_hidden * 3, mlp_hidden])
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.a_net = FCNet([embed_hidden, mlp_hidden])
    self.sim_scorer = SimpleClassifier(mlp_hidden, mlp_hidden, 1, 0.5)
    self.mlp_hidden = mlp_hidden
def __init__(self, dim, self_attention=False, memory_gate=False):
    """MAC write/memory unit.

    Optional self-attention and memory-gate sub-modules are only created
    when enabled, so disabled configurations carry no extra parameters.
    """
    super().__init__()
    self.concat = FCNet([dim * 2, dim])
    if self_attention:
        self.attn = linear(dim, 1)
        self.mem = linear(dim, dim)
    if memory_gate:
        self.control = linear(dim, 1)
    self.self_attention = self_attention
    self.memory_gate = memory_gate
    self.dim = dim
def __init__(self, v_dim, q_dim, num_hid, dropout=0.2):
    """Dual attention: two parallel projection/score paths over (v, q)."""
    super(DualAttention, self).__init__()
    # Path 1 and path 2 each project both modalities into num_hid.
    self.v_proj1 = FCNet([v_dim, num_hid])
    self.v_proj2 = FCNet([v_dim, num_hid])
    self.q_proj1 = FCNet([q_dim, num_hid])
    self.q_proj2 = FCNet([q_dim, num_hid])
    self.dropout = nn.Dropout(dropout)
    self.on_repr1 = FCNet([num_hid, num_hid])
    self.on_repr2 = FCNet([num_hid, num_hid])
    # Scalar attention logits, one weight-normed head per path.
    self.linear1 = weight_norm(nn.Linear(num_hid, 1), dim=None)
    self.linear2 = weight_norm(nn.Linear(num_hid, 1), dim=None)
def __init__(self, v_dim, q_dim, num_hid):
    """First stacked-attention layer.

    Question and visual streams are each reduced 768 -> 640, fused, then
    scored through a 512-wide attention bottleneck.
    In this configuration q_dim = v_dim = num_hidden = 1024.
    """
    super(StackAttention1, self).__init__()
    self.input_size = v_dim
    self.fc_q1 = FCNet([q_dim, 768])
    self.fc_q2 = FCNet([768, 640])
    self.fc_v1 = FCNet([v_dim, 768])
    self.fc_v2 = FCNet([768, 640])
    self.att_size = 512
    self.linear1 = FCNet([640, self.att_size])
    self.fc_vq1 = FCNet([self.att_size, 1])
    self.tan = nn.Tanh()
    self.dp = nn.Dropout(0.5)
def __init__(self, c_dim, num_hid, q_dim, nlayers, bidirect, dropout,
             rnn_type='LSTM', v_dim=2048):
    """Caption/question/image RNN encoder.

    Two parallel RNNs consume caption features: one for attention
    (``rnn_att``) and one for the caption representation (``rnn_c``); a
    visual embedding conditions the caption stream.
    """
    super(CaptionQuestionImageRNN0, self).__init__()
    assert rnn_type == 'LSTM' or rnn_type == 'GRU'
    rnn_cls = nn.LSTM if rnn_type == 'LSTM' else nn.GRU
    norm_layer = get_norm('weight')

    self.rnn_att = rnn_cls(c_dim, num_hid, nlayers,
                           bidirectional=bidirect, dropout=dropout,
                           batch_first=True)
    self.rnn_c = rnn_cls(c_dim, num_hid, nlayers,
                         bidirectional=bidirect, dropout=dropout,
                         batch_first=True)
    self.v_emb_for_c = FCNet([v_dim, num_hid], dropout=0.2,
                             norm='weight', act='LeakyReLU')
    self.v_att_logits = norm_layer(nn.Linear(num_hid, 1), dim=None)
    self.Sig = nn.Sigmoid()

    self.c_dim = c_dim
    self.q_dim = q_dim
    self.num_hid = num_hid
    self.nlayers = nlayers
    # 2 directions when bidirectional, else 1.
    self.ndirections = int(bidirect) + 1
    self.rnn_type = rnn_type
    self.v_dim = v_dim
def __init__(self, encoder, gpu_mode, conv_hidden=24, embed_hidden=300,
             lstm_hidden=300, mlp_hidden=512):
    """Base agent-labelling model over VGG features.

    Args:
        encoder: dataset encoder exposing the label vocabulary size.
        gpu_mode: flag forwarded to training code.
        conv_hidden, embed_hidden, lstm_hidden: kept for interface
            compatibility (unused in this constructor).
        mlp_hidden: hidden width of the MLP trunks.
    """
    super(BaseModel, self).__init__()
    # Standard ImageNet normalization.
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])

    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.vocab_size = self.encoder.get_num_labels()

    self.conv = vgg16_modified()
    self.agent = nn.Sequential(
        nn.Linear(mlp_hidden * 8, mlp_hidden * 2),
        nn.BatchNorm1d(mlp_hidden * 2),
        nn.ReLU(),
        nn.Dropout(0.5),
    )
    # Deep MLP from a 385-d one-hot/feature vector to the comparison space.
    self.oh_to_comp = FCNet(
        [385, mlp_hidden, mlp_hidden, mlp_hidden, mlp_hidden, mlp_hidden])
    self.final_layer = nn.Linear(mlp_hidden * 3, self.vocab_size)
def __init__(self, hidden_dim=1024, dropout_=0.2, **kwargs):
    """Attention-weighted text embedding.

    Required kwargs: ``conv1_out`` (attention bottleneck width) and
    ``conv2_out`` (number of attention glimpses). Output width is
    ``hidden_dim * conv2_out``.
    """
    super(AttentionTextEmbedding, self).__init__()
    self.text_out_dim = hidden_dim * kwargs["conv2_out"]
    self.dropout = nn.Dropout(p=dropout_)

    conv1_out = kwargs["conv1_out"]
    conv2_out = kwargs["conv2_out"]
    self.transform = FCNet([hidden_dim, conv1_out], dropout=0.2)
    # Attention head: dropout then a weight-normed glimpse projection.
    layers = [
        nn.Dropout(0.2, inplace=False),
        weight_norm(nn.Linear(conv1_out, conv2_out), dim=None),
    ]
    self.atte = nn.Sequential(*layers)
def build_vqae3_split(dataset, num_hid, att_dim, dec_dim):
    """Build a split VQA-E model: two attention branches plus an
    explanation decoder whose embedding is shared with the explanation
    encoder.
    """
    w_emb = WordEmbedding(dataset.question_dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)

    # Branch 1 (answering) and branch 2 (explanation grounding).
    v_att_1 = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net_1 = FCNet([q_emb.num_hid, num_hid])
    v_net_1 = FCNet([dataset.v_dim, num_hid])
    v_att_2 = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net_2 = FCNet([q_emb.num_hid, num_hid])
    v_net_2 = FCNet([dataset.v_dim, num_hid])

    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    generator = STDecoder(dataset.v_dim, num_hid, 300, dec_dim,
                          dataset.explanation_dictionary.ntoken, 1, 0.5)
    # The explanation encoder reuses the decoder's word embedding.
    e_emb = ExplainEmbedding(generator.embed, 300, num_hid, 1, False,
                             0.0, 'GRU')
    T_vq = FCNet([num_hid, num_hid])
    T_e = FCNet([e_emb.num_hid, num_hid])
    return Split_VQAE(w_emb, q_emb, v_att_1, q_net_1, v_net_1,
                      v_att_2, q_net_2, v_net_2, classifier,
                      generator, e_emb, T_vq, T_e)
def __init__(self, v_dim, q_dim, num_hid):
    """Concat attention: score each region from [v; q] via an FC net and a
    weight-normed scalar head."""
    super(Attention, self).__init__()
    self.nonlinear = FCNet([v_dim + q_dim, num_hid])
    self.linear = weight_norm(nn.Linear(num_hid, 1), dim=None)
def __init__(self, encoder, gpu_mode, conv_hidden=24, embed_hidden=300,
             lstm_hidden=300, mlp_hidden=512):
    """Verb/agent prediction model with separate agent and verb conv trunks.

    Args:
        encoder: dataset encoder exposing verb/label vocabularies and
            ``question_words``.
        gpu_mode: flag forwarded to training code.
        conv_hidden, lstm_hidden: kept for interface compatibility
            (unused in this constructor).
        embed_hidden: word/label embedding width.
        mlp_hidden: hidden width of the MLP trunks.
    """
    super(BaseModel, self).__init__()
    # Standard ImageNet normalization.
    self.normalize = tv.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
    self.train_transform = tv.transforms.Compose([
        tv.transforms.RandomRotation(10),
        tv.transforms.RandomResizedCrop(224),
        tv.transforms.RandomHorizontalFlip(),
        tv.transforms.ToTensor(),
        self.normalize,
    ])
    self.dev_transform = tv.transforms.Compose([
        tv.transforms.Resize(224),
        tv.transforms.CenterCrop(224),
        tv.transforms.ToTensor(),
        self.normalize,
    ])

    self.encoder = encoder
    self.gpu_mode = gpu_mode
    self.n_verbs = self.encoder.get_num_verbs()
    self.vocab_size = self.encoder.get_num_labels()
    self.q_word_count = len(self.encoder.question_words)

    # Separate conv trunks for the agent head and the verb head.
    self.conv_agent = vgg16_modified()
    self.conv_verb = vgg16_modified()
    self.agent = nn.Sequential(
        nn.Linear(mlp_hidden * 8, mlp_hidden * 2),
        nn.BatchNorm1d(mlp_hidden * 2),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(mlp_hidden * 2, self.vocab_size))
    self.proj_obj = nn.Sequential(nn.Linear(mlp_hidden * 2, mlp_hidden), )
    self.whole_img = nn.Sequential(
        nn.Linear(mlp_hidden * 8, mlp_hidden * 2),
        nn.BatchNorm1d(mlp_hidden * 2),
        nn.ReLU(),
        nn.Linear(mlp_hidden * 2, mlp_hidden),
        nn.Tanh())
    self.agent4img = nn.Sequential(nn.Linear(mlp_hidden * 2, mlp_hidden),
                                   nn.Tanh())
    self.verb = TopDown()
    self.w_emb = nn.Embedding(self.q_word_count, embed_hidden)
    self.agent_label_lookup = nn.Embedding(self.vocab_size, embed_hidden)
    self.q_net = FCNet([mlp_hidden, mlp_hidden])
    self.v_net = FCNet([mlp_hidden, mlp_hidden])
    self.classifier = SimpleClassifier(mlp_hidden, 2 * mlp_hidden,
                                       self.n_verbs, 0.5)