# Shared imports for the snippets below.  Project-specific building blocks
# (FCNet, BCNet, BiAttention, Counter, WordEmbedding, QuestionEmbedding,
# SimpleClassifier, BaseModel, Attention, NewAttention, GraphSelfAttentionLayer,
# GAT, the `fusions` package, download_model, pretrained_urls, ...) are assumed
# to be importable from the surrounding repository.
import torch
import torch.nn as nn
from torch.nn.utils import weight_norm


# BAN: bilinear attention network fusion (constructor only)
def __init__(self, v_relation_dim, num_hid, gamma,
             min_num_objects=10, use_counter=True):
    super(BAN, self).__init__()

    self.v_att = BiAttention(v_relation_dim, num_hid, num_hid, gamma)
    self.glimpse = gamma
    self.use_counter = use_counter
    b_net = []
    q_prj = []
    c_prj = []
    q_att = []
    v_prj = []

    # one bilinear connect net and one question projection per glimpse;
    # the counter projection is only built when the counter module is used
    for i in range(gamma):
        b_net.append(BCNet(v_relation_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        if self.use_counter:
            c_prj.append(FCNet([min_num_objects + 1, num_hid], 'ReLU', .0))

    self.b_net = nn.ModuleList(b_net)
    self.q_prj = nn.ModuleList(q_prj)
    self.q_att = nn.ModuleList(q_att)
    self.v_prj = nn.ModuleList(v_prj)
    if self.use_counter:
        self.c_prj = nn.ModuleList(c_prj)
        self.counter = Counter(min_num_objects)
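# A possible forward pass for the constructor above -- a sketch only, not taken from
# the source.  It assumes the BiAttention.forward_all / BCNet.forward_with_weights
# interfaces of the original BAN code and a Counter that takes (boxes, attention).
def forward(self, v_relation, q_emb, b):
    # att: [batch, glimpse, num_objs, q_len]
    att, att_logits = self.v_att.forward_all(v_relation, q_emb)
    if self.use_counter:
        boxes = b[:, :, :4].transpose(1, 2)
    for g in range(self.glimpse):
        # bilinear attention pooling for glimpse g -> [batch, num_hid]
        b_emb = self.b_net[g].forward_with_weights(v_relation, q_emb, att[:, g, :, :])
        # residual update of the question representation
        q_emb = self.q_prj[g](b_emb.unsqueeze(1)) + q_emb
        if self.use_counter:
            atten, _ = att_logits[:, g, :, :].max(2)
            q_emb = q_emb + self.c_prj[g](self.counter(boxes, atten)).unsqueeze(1)
    return q_emb.sum(1), att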
def baseline(args, dataset, pretrained=False):
    # initialise model
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, args.num_hid, 1, False, 0.0)
    v_att = Attention(dataset.v_dim, q_emb.num_hid, args.num_hid)
    q_net = FCNet([args.num_hid, args.num_hid])
    v_net = FCNet([dataset.v_dim, args.num_hid])
    classifier = SimpleClassifier(args.num_hid, 2 * args.num_hid,
                                  dataset.num_ans_candidates, 0.5)
    model = BaseModel(args, w_emb, q_emb, v_att, q_net, v_net, classifier)

    # map pretrained weights to CPU when CUDA is not available
    map_location = None
    if not model.cuda_available:
        map_location = torch.device('cpu')

    # download and load pretrained weights
    if pretrained:
        key = 'baseline-vqa'
        url = pretrained_urls[key]
        model.load_state_dict(
            download_model(key, url, map_location=map_location)['model'],
            strict=False)
    else:
        key = 'untrained'

    # set model name
    model.name = key
    return model
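# Hedged usage sketch: `args` and `dataset` are the same objects the builder above
# reads from; BaseModel's forward signature is not shown in the source, so only
# construction and bookkeeping are illustrated here.
model = baseline(args, dataset, pretrained=True)
print(model.name)   # 'baseline-vqa' when pretrained weights were loaded, else 'untrained'
model.eval()        # switch to inference mode before answering questions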
# Attention: product attention with an optional number of glimpses
def __init__(self, v_dim, q_dim, hid_dim, glimpses=1, dropout=0.2):
    super(Attention, self).__init__()
    # pass dropout by keyword: FCNet's second positional argument is the activation
    self.v_proj = FCNet([v_dim, hid_dim], dropout=dropout)
    self.q_proj = FCNet([q_dim, hid_dim], dropout=dropout)
    self.drop = nn.Dropout(dropout)
    self.linear = weight_norm(nn.Linear(hid_dim, glimpses), dim=None)
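# A hedged sketch of the matching forward pass (not shown in the source): project both
# modalities to hid_dim, fuse with an elementwise product, and emit one logit per
# glimpse for every object, softmaxed over the object axis.
def forward(self, v, q):
    """
    v: [batch, k, v_dim]   q: [batch, q_dim]
    returns attention weights [batch, k, glimpses]
    """
    v_proj = self.v_proj(v)                        # [batch, k, hid_dim]
    q_proj = self.q_proj(q).unsqueeze(1)           # [batch, 1, hid_dim]
    logits = self.linear(self.drop(v_proj * q_proj))
    return nn.functional.softmax(logits, dim=1)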
# NewAttention: product attention used by the 'newatt' baseline
def __init__(self, v_dim, q_dim, num_hid, dropout=0.2):
    super(NewAttention, self).__init__()
    self.v_proj = FCNet([v_dim, num_hid])
    self.q_proj = FCNet([q_dim, num_hid])
    self.dropout = nn.Dropout(dropout)
    # the attention logits are computed on the joint (num_hid-wide) representation,
    # so the final projection maps num_hid -> 1
    self.linear = weight_norm(nn.Linear(num_hid, 1), dim=None)
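# A hedged sketch of NewAttention's forward pass, following the standard bottom-up
# top-down formulation that this constructor matches: product fusion in num_hid space,
# then one logit per object region.
def forward(self, v, q):
    """
    v: [batch, k, v_dim]   q: [batch, q_dim]
    """
    batch, k, _ = v.size()
    v_proj = self.v_proj(v)                               # [batch, k, num_hid]
    q_proj = self.q_proj(q).unsqueeze(1).repeat(1, k, 1)  # [batch, k, num_hid]
    joint_repr = self.dropout(v_proj * q_proj)
    logits = self.linear(joint_repr)                      # [batch, k, 1]
    return nn.functional.softmax(logits, dim=1)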
def __init__(self, v_dim, q_dim, h_dim, h_out, act='ReLU',
             dropout=[.2, .5], k=3):
    super(BCNet, self).__init__()

    self.c = 32
    self.k = k
    self.v_dim = v_dim
    self.q_dim = q_dim
    self.h_dim = h_dim
    self.h_out = h_out
    self.v_net = FCNet([v_dim, h_dim * self.k], act=act, dropout=dropout[0])
    self.q_net = FCNet([q_dim, h_dim * self.k], act=act, dropout=dropout[0])
    self.dropout = nn.Dropout(dropout[1])  # attention
    if 1 < k:
        self.p_net = nn.AvgPool1d(self.k, stride=self.k)

    if h_out is None:
        pass
    elif h_out <= self.c:
        self.h_mat = nn.Parameter(
            torch.Tensor(1, h_out, 1, h_dim * self.k).normal_())
        self.h_bias = nn.Parameter(
            torch.Tensor(1, h_out, 1, 1).normal_())
    else:
        self.h_net = weight_norm(
            nn.Linear(h_dim * self.k, h_out), dim=None)
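# Illustrative only: the low-rank bilinear pooling idea that BCNet builds on --
# project both inputs to h_dim * k, multiply elementwise, then sum-pool every k
# channels (the AvgPool1d above times k).  Plain nn.Linear stands in for FCNet and
# the shapes are hypothetical; this is not the repo's forward().
import torch
import torch.nn as nn

h_dim, k = 8, 3
v_net, q_net = nn.Linear(16, h_dim * k), nn.Linear(12, h_dim * k)
p_net = nn.AvgPool1d(k, stride=k)

v, q = torch.randn(2, 16), torch.randn(2, 12)
joint = v_net(v) * q_net(q)                          # [2, h_dim * k]
pooled = p_net(joint.unsqueeze(1)).squeeze(1) * k    # [2, h_dim], sum over each group of k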
# SimpleClassifier: answer classifier with separate question and visual projections
def __init__(self, in_dim, hid_dim, out_dim, dropout=0.0):
    super(SimpleClassifier, self).__init__()
    # pass dropout by keyword: FCNet's second positional argument is the activation
    self.q_net = FCNet([in_dim[0], hid_dim[0]], dropout=dropout)
    self.v_net = FCNet([in_dim[1], hid_dim[0]], dropout=dropout)
    self.main = nn.Sequential(
        nn.Linear(hid_dim[0], hid_dim[1]),
        nn.ReLU(),
        nn.Dropout(dropout, inplace=True),
        nn.Linear(hid_dim[1], out_dim))
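# A hedged guess at the corresponding forward pass (only the constructor is shown in
# the source): fuse the two modalities with an elementwise product, then score the
# answer candidates with the small MLP above.
def forward(self, q_emb, v_emb):
    joint = self.q_net(q_emb) * self.v_net(v_emb)   # [batch, hid_dim[0]]
    return self.main(joint)                         # [batch, out_dim] answer logits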
# BUTD: bottom-up top-down attention fusion over relation-aware visual features
def __init__(self, v_relation_dim, q_dim, num_hid, dropout=0.2):
    super(BUTD, self).__init__()
    self.v_proj = FCNet([v_relation_dim, num_hid])
    self.q_proj = FCNet([q_dim, num_hid])
    self.dropout = nn.Dropout(dropout)
    # scores the num_hid-wide joint representation (relies on q_dim == num_hid,
    # as in the usual configuration)
    self.linear = FCNet([q_dim, 1])
    self.q_net = FCNet([q_dim, num_hid])
    self.v_net = FCNet([v_relation_dim, num_hid])
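# A hedged sketch of the bottom-up top-down fusion this constructor supports:
# attention logits over the relation-aware object features, softmax, an
# attention-weighted visual sum, then product fusion with the question.
def forward(self, v_relation, q_emb):
    num_objs = v_relation.size(1)
    q_tiled = q_emb.unsqueeze(1).repeat(1, num_objs, 1)
    joint = self.dropout(self.v_proj(v_relation) * self.q_proj(q_tiled))
    att = nn.functional.softmax(self.linear(joint), dim=1)   # [batch, num_objs, 1]
    v_emb = (att * v_relation).sum(1)                        # attended visual feature
    joint_repr = self.q_net(q_emb) * self.v_net(v_emb)       # [batch, num_hid]
    return joint_repr, att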
def __init__(self, num_hid, dropout):
    super(QuestionSelfAttention, self).__init__()
    self.num_hid = num_hid
    self.drop = nn.Dropout(dropout)
    self.W1_self_att_q = FCNet(dims=[num_hid, num_hid], dropout=dropout, act=None)
    self.W2_self_att_q = FCNet(dims=[num_hid, 1], act=None)
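# A hedged sketch of question self-attention with the two layers above: score every
# word state with a tanh MLP, softmax over the question length, and return the
# attention-weighted sum of word states.  Dropout placement and shapes are assumptions.
def forward(self, ques_feat):
    """
    ques_feat: [batch, q_len, num_hid]  -- per-word question states
    """
    hidden = torch.tanh(self.W1_self_att_q(self.drop(ques_feat)))
    scores = self.W2_self_att_q(hidden).squeeze(-1)              # [batch, q_len]
    weights = nn.functional.softmax(scores, dim=1).unsqueeze(1)  # [batch, 1, q_len]
    return torch.bmm(weights, ques_feat).squeeze(1)              # [batch, num_hid]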
# MuTAN_Attention: multimodal fusion-based attention with an optional glimpse MLP
def __init__(self, dim_v, dim_q, dim_out, method="Mutan", mlp_glimpses=0):
    super(MuTAN_Attention, self).__init__()
    self.mlp_glimpses = mlp_glimpses
    # bilinear fusion module (e.g. Mutan) looked up by name from the fusions package
    self.fusion = getattr(fusions, method)(
        [dim_q, dim_v], dim_out, mm_dim=1200, dropout_input=0.1)
    if self.mlp_glimpses > 0:
        self.linear0 = FCNet([dim_out, 512], '', 0)
        self.linear1 = FCNet([512, mlp_glimpses], '', 0)
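# A hedged sketch of how the optional glimpse MLP is typically used: fuse the tiled
# question with every object feature, map the fused vector to one score per glimpse,
# softmax over objects, and return the concatenated attention-weighted visual vectors.
# The fusion call assumes the fusions-package convention fusion([q, v]) on 2-D inputs.
def forward(self, q, v):
    batch, num_objs, _ = v.shape
    q_tiled = q.unsqueeze(1).expand(-1, num_objs, -1).contiguous()
    alpha = self.fusion([q_tiled.view(batch * num_objs, -1),
                         v.contiguous().view(batch * num_objs, -1)])
    alpha = alpha.view(batch, num_objs, -1)
    if self.mlp_glimpses > 0:
        alpha = self.linear1(nn.functional.relu(self.linear0(alpha)))
    alpha = nn.functional.softmax(alpha, dim=1)              # [batch, num_objs, glimpses]
    v_out = torch.cat([(alpha[:, :, g:g + 1] * v).sum(1)     # one attended vector per glimpse
                       for g in range(alpha.size(2))], dim=1)
    return v_out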
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    classifier = SimpleClassifier(num_hid, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, q_net, v_net, classifier)
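# Hedged usage sketch: `dataset` is assumed to expose the attributes the builder reads
# above (dictionary.ntoken, v_dim, num_ans_candidates); BaseModel's training loop is
# not shown in the source, so only construction is illustrated.
model = build_baseline0_newatt(dataset, num_hid=1024)
if torch.cuda.is_available():
    model = model.cuda()
print(sum(p.numel() for p in model.parameters()))   # rough parameter count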
# GraphSelfAttentionLayer: relation-aware multi-head self-attention over ROI features
def __init__(self, feat_dim, nongt_dim=20, pos_emb_dim=-1,
             num_heads=16, dropout=[0.2, 0.5]):
    """ Attention module, vectorized version

    Self-attention K/Q/V: K and Q are obtained from the input features through
    learned projections, K = X @ W_k and Q = X @ W_q, so each needs its own small
    network; V can either be projected with W_v or simply be the input features X
    themselves (as done below).

    Args:
        position_embedding: [num_rois, nongt_dim, pos_emb_dim],
                            used for implicit relations
        pos_emb_dim: set to -1 for explicit relations
        nongt_dim: number of objects considered for relations per image
        fc_dim: should be the same as num_heads
        feat_dim: dimension of roi_feat
        num_heads: number of attention heads
    Returns:
        output: [num_rois, ovr_feat_dim, output_dim]
    """
    super(GraphSelfAttentionLayer, self).__init__()
    # multi head
    self.fc_dim = num_heads     # 16
    self.feat_dim = feat_dim    # 1024
    self.dim = (feat_dim, feat_dim, feat_dim)
    self.dim_group = (int(self.dim[0] / num_heads),
                      int(self.dim[1] / num_heads),
                      int(self.dim[2] / num_heads))   # (64, 64, 64)
    self.num_heads = num_heads                        # 16
    self.pos_emb_dim = pos_emb_dim                    # 64 for implicit, -1 for explicit relations

    # implicit relations add one extra layer, pair_pos_fc1, for the position embedding
    if self.pos_emb_dim > 0:
        self.pair_pos_fc1 = FCNet([pos_emb_dim, self.fc_dim], None, dropout[0])  # [64, 16]

    # query projection: Dropout(0.2) followed by Linear(1024 -> 1024)
    self.query = FCNet([feat_dim, self.dim[0]], None, dropout[0])
    self.nongt_dim = nongt_dim  # 20
    # key projection: Dropout(0.2) followed by Linear(1024 -> 1024)
    self.key = FCNet([feat_dim, self.dim[1]], None, dropout[0])

    # weight-normalized grouped 1x1 convolution that maps the per-head outputs
    # (fc_dim * feat_dim channels) back to feat_dim channels
    self.linear_out_ = weight_norm(
        nn.Conv2d(in_channels=self.fc_dim * feat_dim,
                  out_channels=self.dim[2],
                  kernel_size=(1, 1),
                  groups=self.fc_dim),
        dim=None)
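# Illustrative only (hypothetical shapes): a plausible reading of why linear_out_ is a
# *grouped* 1x1 convolution.  With groups == num_heads, each head's aggregated value
# vector (feat_dim channels) gets its own small projection to feat_dim / num_heads
# channels, and the per-head outputs are concatenated back to feat_dim channels -- a
# per-head linear layer implemented as a single conv call.
import torch
import torch.nn as nn

num_heads, feat_dim = 16, 1024
conv = nn.Conv2d(in_channels=num_heads * feat_dim, out_channels=feat_dim,
                 kernel_size=(1, 1), groups=num_heads)
x = torch.randn(4, num_heads * feat_dim, 1, 1)   # one "pixel" per ROI
out = conv(x)                                    # [4, feat_dim, 1, 1]
print(out.shape)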
# ExplicitRelationEncoder: question-conditioned graph attention over explicit relations
def __init__(self, v_dim, q_dim, out_dim, dir_num, label_num,
             nongt_dim=20, num_heads=16, num_steps=1,
             residual_connection=True, label_bias=True):
    super(ExplicitRelationEncoder, self).__init__()
    self.v_dim = v_dim
    self.q_dim = q_dim
    self.out_dim = out_dim
    self.num_steps = num_steps
    self.residual_connection = residual_connection
    print("In ExplicitRelationEncoder, num of graph propagation steps:",
          "%d, residual_connection: %s" % (self.num_steps,
                                           self.residual_connection))

    if self.v_dim != self.out_dim:
        self.v_transform = FCNet([v_dim, out_dim])
    else:
        self.v_transform = None
    in_dim = out_dim + q_dim
    self.explicit_relation = GAT(dir_num, label_num, in_dim, out_dim,
                                 nongt_dim=nongt_dim,
                                 num_heads=num_heads,
                                 label_bias=label_bias,
                                 pos_emb_dim=-1)
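# A hedged sketch of the propagation loop implied by num_steps and residual_connection
# (not from the source): project v if needed, concatenate a tiled copy of the question
# embedding onto every object feature, run the relation-aware graph attention, and
# optionally add the result back.
def forward(self, v, exp_adj_matrix, q):
    """
    v:              [batch, num_rois, v_dim]
    exp_adj_matrix: [batch, num_rois, num_rois, label_num]
    q:              [batch, q_dim]
    """
    imp_v = self.v_transform(v) if self.v_transform else v
    for _ in range(self.num_steps):
        q_tiled = q.unsqueeze(1).expand(-1, imp_v.size(1), -1)
        v_cat_q = torch.cat((imp_v, q_tiled), dim=-1)   # [batch, num_rois, out_dim + q_dim]
        v_rel = self.explicit_relation(v_cat_q, exp_adj_matrix)
        imp_v = (imp_v + v_rel) if self.residual_connection else v_rel
    return imp_v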
# GAttNet (GAT): directional graph attention over labelled edges
def __init__(self, dir_num, label_num, in_feat_dim, out_feat_dim,
             nongt_dim=20, dropout=0.2, label_bias=True,
             num_heads=16, pos_emb_dim=-1):
    """ Attention module, vectorized version

    Args:
        dir_num: number of edge directions
        label_num: number of edge labels
        feat_dim: dimension of roi_feat
        pos_emb_dim: dimension of position embedding for implicit relations,
                     set to -1 for explicit relations
    Returns:
        output: [num_rois, ovr_feat_dim, output_dim]
    """
    super(GAttNet, self).__init__()
    assert dir_num <= 2, "Got more than two directions in a graph."
    self.dir_num = dir_num              # typically 2
    self.label_num = label_num          # e.g. 11 explicit relation labels
    self.in_feat_dim = in_feat_dim      # e.g. 2048
    self.out_feat_dim = out_feat_dim    # e.g. 1024
    self.dropout = nn.Dropout(dropout)  # 0.2
    # self-loop transform: Linear(in_feat_dim -> out_feat_dim) with dropout
    self.self_weights = FCNet([in_feat_dim, out_feat_dim], '', dropout)
    # per-label bias: Linear(label_num -> 1), optionally without a bias term
    self.bias = FCNet([label_num, 1], '', 0, label_bias)
    self.nongt_dim = nongt_dim          # 20
    self.pos_emb_dim = pos_emb_dim      # -1 for explicit relations

    # one graph self-attention layer per edge direction
    neighbor_net = []
    for i in range(dir_num):
        g_att_layer = GraphSelfAttentionLayer(pos_emb_dim=pos_emb_dim,
                                              num_heads=num_heads,
                                              feat_dim=out_feat_dim,
                                              nongt_dim=nongt_dim)
        neighbor_net.append(g_att_layer)
    self.neighbor_net = nn.ModuleList(neighbor_net)
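# A hedged sketch of the directional message passing these modules support (not from
# the source): transform each node with the self-loop weights, let each direction's
# graph self-attention layer aggregate neighbours selected by the (possibly transposed)
# adjacency matrix, add the per-label bias, sum the directions, then dropout + ReLU.
# The GraphSelfAttentionLayer forward signature used here is an assumption.
def forward(self, v_feat, adj_matrix, pos_emb=None):
    """
    v_feat:     [batch, num_rois, in_feat_dim]
    adj_matrix: [batch, num_rois, num_rois, label_num]
    """
    adj_matrix = adj_matrix.float()
    # one adjacency view per direction: incoming and outgoing edges
    adj_list = [adj_matrix, adj_matrix.transpose(1, 2)]
    self_feat = self.self_weights(v_feat)            # [batch, num_rois, out_feat_dim]
    output = self_feat
    for d in range(self.dir_num):
        adj_d = adj_list[d][:, :, :self.nongt_dim, :]
        condensed_adj = torch.sum(adj_d, dim=-1)     # collapse labels to edge presence
        label_biases = self.bias(adj_d)              # [batch, num_rois, nongt_dim, 1]
        output = output + self.neighbor_net[d](self_feat, condensed_adj,
                                               pos_emb, label_biases)
    return nn.functional.relu(self.dropout(output))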
def __init__(self, v_dim, q_dim, num_hid):
    super(Attention, self).__init__()
    self.nonlinear = FCNet([v_dim + q_dim, num_hid])
    self.linear = weight_norm(nn.Linear(num_hid, 1), dim=None)
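# A hedged sketch of the usual forward pass for this concatenation-based attention:
# tile the question over the object axis, concatenate with the visual features, score
# each object, and softmax over objects.
def forward(self, v, q):
    """
    v: [batch, k, v_dim]   q: [batch, q_dim]
    """
    num_objs = v.size(1)
    q_tiled = q.unsqueeze(1).repeat(1, num_objs, 1)
    joint_repr = self.nonlinear(torch.cat((v, q_tiled), dim=2))
    logits = self.linear(joint_repr)                 # [batch, k, 1]
    return nn.functional.softmax(logits, dim=1)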