def __init__(self, in_features, mid_features, out_features, drop):
    super().__init__()
    self.drop = nn.Dropout(drop)
    self.lin11 = nn.Linear(in_features[0], mid_features)
    self.lin12 = nn.Linear(in_features[1], mid_features)
    self.lin2 = FCNet(mid_features, mid_features, activate='relu')
    self.lin3 = FCNet(mid_features, out_features)
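# FCNet is used by every module in this file but is not defined here. Below is
# a minimal sketch of a compatible implementation, assuming a weight-normed
# nn.Linear followed by dropout and an optional ReLU. The constructor signature
# is taken from the call sites in this file; the internals are an assumption.
import torch.nn as nn
from torch.nn.utils import weight_norm

class FCNet(nn.Module):
    def __init__(self, in_size, out_size, activate=None, drop=0.0, bias=True):
        super(FCNet, self).__init__()
        self.lin = weight_norm(nn.Linear(in_size, out_size, bias=bias), dim=None)
        self.activate = nn.ReLU() if activate == 'relu' else None
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.drop(self.lin(x))
        if self.activate is not None:
            x = self.activate(x)
        return x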
def __init__(self, num_block, v_size, q_size, output_size, num_inter_head, num_intra_head, drop=0.0):
    super(MultiBlock, self).__init__()
    self.v_size = v_size
    self.q_size = q_size
    self.output_size = output_size
    self.num_inter_head = num_inter_head
    self.num_intra_head = num_intra_head
    self.num_block = num_block
    self.v_lin = FCNet(v_size, output_size, drop=drop)
    self.q_lin = FCNet(q_size, output_size, drop=drop)
    self.trans_lin = FCNet(q_size, output_size, drop=drop)
    blocks = []
    for i in range(num_block):
        blocks.append(OneSideInterModalityUpdate(output_size, output_size, output_size, num_inter_head, drop))
        blocks.append(OneSideInterModalityUpdate(output_size, output_size, output_size, num_inter_head, drop))
        blocks.append(DyIntraModalityUpdate(output_size, output_size, output_size, num_intra_head, drop))
    self.multi_blocks = nn.ModuleList(blocks)
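# A hedged sketch of how a forward pass might walk the ModuleList above: the
# blocks were appended in groups of three, so indices 3*i and 3*i + 1 are the
# two one-sided inter-modality updates and 3*i + 2 is the intra-modality
# update. The update order and the v->q / q->v assignment are assumptions.
def forward(self, v, q):
    v = self.v_lin(v)  # project both modalities into the shared output_size space
    q = self.q_lin(q)
    for i in range(self.num_block):
        v2q = self.multi_blocks[3 * i]       # assumed: question attends to vision
        q2v = self.multi_blocks[3 * i + 1]   # assumed: vision attends to question
        intra = self.multi_blocks[3 * i + 2]
        q = v2q(v, q)
        v = q2v(q, v)
        v, q = intra(v, q)
    return v, q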
def __init__(self, num_block, v_size, q_size, output_size, num_inter_head, num_intra_head, drop=0.0):
    super(SingleBlock, self).__init__()
    self.v_size = v_size
    self.q_size = q_size
    self.output_size = output_size
    self.num_inter_head = num_inter_head
    self.num_intra_head = num_intra_head
    self.num_block = num_block
    self.v_lin = FCNet(v_size, output_size, drop=drop)
    self.q_lin = FCNet(q_size, output_size, drop=drop)
    self.v2q_interBlock = OneSideInterModalityUpdate(output_size, output_size, output_size, num_inter_head, drop)
    self.q2v_interBlock = OneSideInterModalityUpdate(output_size, output_size, output_size, num_inter_head, drop)
    self.intraBlock = DyIntraModalityUpdate(output_size, output_size, output_size, num_intra_head, drop)
def __init__(self, in_features, mid_features, out_features, drop=0.0):
    super(Classifier, self).__init__()
    self.lin1 = FCNet(in_features, mid_features, activate='relu', drop=drop / 2.5)
    self.lin2 = FCNet(mid_features, out_features, drop=drop)
def __init__(self, src_size, tgt_size, output_size, num_head, drop=0.0):
    super(OneSideInterModalityUpdate, self).__init__()
    self.src_size = src_size
    self.tgt_size = tgt_size
    self.output_size = output_size
    self.num_head = num_head
    self.src_lin = FCNet(src_size, output_size * 2, drop=drop)
    self.tgt_lin = FCNet(tgt_size, output_size, drop=drop)
    self.tgt_output = FCNet(output_size + tgt_size, output_size, drop=drop)
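# A self-contained sketch of the one-sided attention this module plausibly
# computes: src_lin's 2 * output_size projection splits into key and value,
# tgt_lin provides the query, and attention flows source -> target before the
# attended features are concatenated with the raw target for tgt_output.
# The head-splitting layout and scaling are assumptions.
import torch
import torch.nn.functional as F

batch, num_src, num_tgt, d, num_head = 2, 36, 14, 512, 8
src_proj = torch.randn(batch, num_src, d * 2)  # stand-in for self.src_lin(src)
tgt_query = torch.randn(batch, num_tgt, d)     # stand-in for self.tgt_lin(tgt)

key, value = src_proj.split(d, dim=2)
# split into heads: (batch, num_head, seq, d // num_head)
k = key.view(batch, num_src, num_head, d // num_head).transpose(1, 2)
v = value.view(batch, num_src, num_head, d // num_head).transpose(1, 2)
q = tgt_query.view(batch, num_tgt, num_head, d // num_head).transpose(1, 2)

attn = F.softmax(q @ k.transpose(2, 3) / (d // num_head) ** 0.5, dim=3)
attended = (attn @ v).transpose(1, 2).reshape(batch, num_tgt, d)
print(attended.shape)  # torch.Size([2, 14, 512])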
def __init__(self, hidden_size, mid_size, num_token):
    super(TimeReasoningCell, self).__init__()
    self.linear_h = FCNet(hidden_size, mid_size, activate='relu')
    self.linear_q = FCNet(mid_size, mid_size, activate='relu')
    self.hidden_size = hidden_size
    # self.bilinear = modulars.BiAttention(mid_size, hidden_size, hidden_size, glimpse=1)
    self.encoder_prior = Encoder(hidden_size, 1, num_layers=0)
    self.encoder_posterior = Encoder(mid_size, 1, num_layers=0)
    self.encoder_back = Encoder(mid_size, 1, num_layers=0)
def __init__(self, v_features, q_features, mid_features, glimpses, drop=0.0):
    super(BiAttention, self).__init__()
    self.hidden_aug = 3
    self.glimpses = glimpses
    self.lin_v = FCNet(v_features, int(mid_features * self.hidden_aug), activate='relu', drop=drop / 2.5)  # let self.lin take care of bias
    self.lin_q = FCNet(q_features, int(mid_features * self.hidden_aug), activate='relu', drop=drop / 2.5)
    self.h_weight = nn.Parameter(torch.Tensor(1, glimpses, 1, int(mid_features * self.hidden_aug)).normal_())
    self.h_bias = nn.Parameter(torch.Tensor(1, glimpses, 1, 1).normal_())
    self.drop = nn.Dropout(drop)
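# A runnable sketch of the bilinear attention map this module plausibly
# produces: v and q are each lifted to mid_features * hidden_aug, combined
# multiplicatively per glimpse through h_weight, and reduced to a
# (batch, glimpses, num_obj, q_len) logit tensor plus h_bias. The exact
# contraction is an assumption based on the parameter shapes above.
import torch

batch, num_obj, q_len, glimpses = 2, 10, 14, 5
mid_aug = 512 * 3                           # mid_features * hidden_aug
v_ = torch.randn(batch, num_obj, mid_aug)   # stand-in for self.lin_v(v)
q_ = torch.randn(batch, q_len, mid_aug)     # stand-in for self.lin_q(q)
h_weight = torch.randn(1, glimpses, 1, mid_aug)
h_bias = torch.randn(1, glimpses, 1, 1)

# weight each visual feature per glimpse, then take a bilinear product with q
v_g = v_.unsqueeze(1) * h_weight                          # (batch, glimpses, num_obj, mid_aug)
logits = v_g @ q_.unsqueeze(1).transpose(2, 3) + h_bias   # (batch, glimpses, num_obj, q_len)
print(logits.shape)  # torch.Size([2, 5, 10, 14])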
def __init__(self, v_size, q_size, output_size, num_head, drop=0.0):
    super(InterModalityUpdate, self).__init__()
    self.v_size = v_size
    self.q_size = q_size
    self.output_size = output_size
    self.num_head = num_head
    # both modalities project to 3 * output_size (presumably query, key and
    # value), so attention can flow in both directions at once
    self.v_lin = FCNet(v_size, output_size * 3, drop=drop)
    self.q_lin = FCNet(q_size, output_size * 3, drop=drop)
    self.v_output = FCNet(output_size + v_size, output_size, drop=drop)
    self.q_output = FCNet(output_size + q_size, output_size, drop=drop)
def __init__(self, **kwargs):
    """
    kwargs: vocab, dim_word, dim_hidden, dim_vision, dim_edge, mid_size,
        state_size, glimpses, dropout_prob, device
    """
    super(Net, self).__init__()
    for k, v in kwargs.items():
        setattr(self, k, v)
    self.num_classes = len(self.vocab['answer_token_to_idx'])
    self.num_token = len(self.vocab['question_token_to_idx'])
    self.classifier = Classifier(
        in_features=(self.glimpses * self.dim_vision, self.mid_size),
        mid_features=self.dim_hidden,
        out_features=self.num_classes,
        drop=self.dropout_prob)
    self.token_embedding = nn.Embedding(self.num_token, self.dim_word)
    for p in self.token_embedding.parameters():
        p.requires_grad = False  # freeze the word embeddings
    self.dropout = nn.Dropout(self.dropout_prob)
    self.text = modulars.BiGRUEncoder_2(
        dim_word=self.dim_word,
        dim_hidden=self.dim_hidden,
    )
    self.vision_to_v = FCNet(self.dim_vision, self.dim_hidden, drop=0.3, bias=False)
    self.map_two_v_to_edge = FCNet(self.dim_hidden * 2, self.dim_edge, bias=False)
    self.timereasoning = TimeReasoning(
        hidden_size=self.dim_vision,
        mid_size=self.mid_size,
        state_size=self.state_size,
        num_token=self.num_token,
        edge_size=self.dim_edge)
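# Net reads its entire configuration from **kwargs and pins every entry onto
# self, so construction looks like the sketch below. All values here are
# placeholders for illustration, not values taken from this repo.
vocab = {
    'question_token_to_idx': {'<UNK>': 0, 'what': 1, 'color': 2},
    'answer_token_to_idx': {'red': 0, 'blue': 1},
}
net = Net(
    vocab=vocab,
    dim_word=300,
    dim_hidden=512,
    dim_vision=2048,
    dim_edge=256,
    mid_size=512,
    state_size=512,
    glimpses=2,
    dropout_prob=0.5,
    device='cuda',
)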
def __init__(self, v_features, q_features, mid_features, num_obj, drop=0.0):
    super(ApplySingleAttention, self).__init__()
    self.lin_v = FCNet(v_features, mid_features, activate='relu', drop=drop)  # let self.lin take care of bias
    self.lin_q = FCNet(q_features, mid_features, activate='relu', drop=drop)
    self.lin_atten = FCNet(mid_features, mid_features, drop=drop)
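# A runnable sketch of the job ApplySingleAttention plausibly performs with
# one glimpse of an attention map: pool the visual features through the map,
# fuse them with the question features, and feed the result to lin_atten.
# The multiplicative fusion and the pooling axis are assumptions.
import torch

batch, num_obj, q_len, mid = 2, 10, 14, 512
v_ = torch.randn(batch, num_obj, mid)  # stand-in for self.lin_v(v)
q_ = torch.randn(batch, q_len, mid)    # stand-in for self.lin_q(q)
atten = torch.softmax(torch.randn(batch, num_obj, q_len), dim=1)  # one glimpse

v_attended = atten.transpose(1, 2) @ v_   # (batch, q_len, mid)
fused = (v_attended * q_).sum(dim=1)      # (batch, mid), input to lin_atten
print(fused.shape)  # torch.Size([2, 512])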
def __init__(self, v_size, q_size, output_size, num_head, drop=0.0):
    super(DyIntraModalityUpdate, self).__init__()
    self.v_size = v_size
    self.q_size = q_size
    self.output_size = output_size
    self.num_head = num_head
    self.v4q_gate_lin = FCNet(v_size, output_size, drop=drop)
    self.q4v_gate_lin = FCNet(q_size, output_size, drop=drop)
    self.v_lin = FCNet(v_size, output_size * 3, drop=drop)
    self.q_lin = FCNet(q_size, output_size * 3, drop=drop)
    self.v_output = FCNet(output_size, output_size, drop=drop)
    self.q_output = FCNet(output_size, output_size, drop=drop)
    self.relu = nn.ReLU()
    self.tanh = nn.Tanh()
    self.sigmoid = nn.Sigmoid()
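# A runnable sketch of the "dynamic" gating this module plausibly applies:
# each modality is summarized (mean-pooled), passed through the *other*
# modality's gate projection (v4q_gate_lin / q4v_gate_lin), squashed with
# sigmoid, and used to modulate the self-attention inputs. Exactly which
# tensors get gated, and the (1 + gate) form, are assumptions.
import torch

batch, num_obj, d = 2, 10, 512
v = torch.randn(batch, num_obj, d)
q_mean = torch.randn(batch, d)    # stand-in for q.mean(dim=1)
gate_lin = torch.nn.Linear(d, d)  # stand-in for self.q4v_gate_lin

q4v_gate = torch.sigmoid(gate_lin(q_mean)).unsqueeze(1)  # (batch, 1, d)
v_gated = (1 + q4v_gate) * v  # condition vision self-attention on the question summary
print(v_gated.shape)          # torch.Size([2, 10, 512])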
def __init__(self, edge_size, hidden_size):
    super(TransCell, self).__init__()
    # self.linear_r = FCNet(edge_size, edge_size // 2)
    self.linear_q = FCNet(hidden_size, edge_size)
    self.linear_out = FCNet(edge_size, 1, drop=0.1)
def __init__(self, **kwargs):
    """
    kwargs: vocab, dim_word, dim_hidden, dim_vision, dim_edge, mid_size,
        state_size, dropout_prob, device
    """
    super(Net, self).__init__()
    for k, v in kwargs.items():
        setattr(self, k, v)
    glimpses = 5  # hardcoded here rather than taken from kwargs
    objects = 10
    self.num_classes = len(self.vocab['answer_token_to_idx'])
    self.num_token = len(self.vocab['question_token_to_idx'])
    self.classifier = Classifier(
        in_features=self.mid_size,
        mid_features=self.dim_hidden * 2,
        out_features=self.num_classes,
        drop=self.dropout_prob)
    self.token_embedding = nn.Embedding(self.num_token, self.dim_word)
    for p in self.token_embedding.parameters():
        p.requires_grad = False  # freeze the word embeddings
    self.dropout = nn.Dropout(self.dropout_prob)
    self.text = modulars.BiGRUEncoder_2(
        dim_word=self.dim_word,
        dim_hidden=self.dim_hidden,
    )
    self.vision_to_v = FCNet(self.dim_vision, self.dim_hidden, drop=0.3, bias=False)
    self.map_two_v_to_edge = FCNet(self.dim_hidden * 2, self.dim_edge, bias=False)
    self.timereasoning = TimeReasoning(
        hidden_size=self.dim_vision,
        mid_size=self.mid_size,
        state_size=self.state_size,
        num_token=self.num_token,
        edge_size=self.dim_edge)
    self.count = Counter(objects)
    self.attention = weight_norm(
        BiAttention(
            v_features=self.dim_vision,
            q_features=self.dim_hidden,
            mid_features=self.dim_hidden,
            glimpses=glimpses,
            drop=0.5,
        ),
        name='h_weight', dim=None)
    self.apply_attention = ApplyAttention(
        v_features=self.dim_vision,
        q_features=self.dim_hidden,
        mid_features=self.dim_hidden,
        glimpses=glimpses,
        num_obj=objects,
        drop=0.2,
    )
def __init__(self, edge_size, hidden_size):
    super(TransCell, self).__init__()
    self.linear_r = FCNet(edge_size, 256)
    self.linear_q = FCNet(hidden_size, 256)
    self.linear_out = FCNet(256, 1)
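# A self-contained sketch of the fusion this TransCell variant plausibly
# performs: edge features and the question state are projected into a shared
# 256-dim space, fused, and reduced to a single transition score per edge.
# The additive fusion and the ReLU are assumptions; plain nn.Linear stands in
# for the FCNet projections above.
import torch
import torch.nn as nn

edge_size, hidden_size = 128, 512
linear_r = nn.Linear(edge_size, 256)
linear_q = nn.Linear(hidden_size, 256)
linear_out = nn.Linear(256, 1)

edge = torch.randn(4, edge_size)
q = torch.randn(4, hidden_size)
score = linear_out(torch.relu(linear_r(edge) + linear_q(q)))  # (4, 1) transition logit per edge
print(score.shape)  # torch.Size([4, 1])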