def __init__(self, **kwargs):
    """
    Build the network's submodules.

    kwargs (set as attributes via setattr and read below):
        vocab          -- dict with 'answer_token_to_idx' and 'question_token_to_idx'
        dim_word       -- word-embedding size
        dim_hidden     -- question-encoder hidden size
        dim_vision     -- visual feature size
        mid_size       -- fused representation size
        state_size, dim_edge, dropout_prob, glimpses, device
        (glimpses/device are accepted but not used in this constructor)
    """
    super(Net, self).__init__()
    # Expose every kwarg as an attribute; subsequent lines rely on this.
    for k, v in kwargs.items():
        setattr(self, k, v)

    self.num_inter_head = 8
    self.num_intra_head = 8
    self.num_block = 6
    self.num_classes = len(self.vocab['answer_token_to_idx'])
    self.num_token = len(self.vocab['question_token_to_idx'])

    self.token_embedding = nn.Embedding(self.num_token, self.dim_word)
    # BUG FIX: original wrote `p.require_grad = False` (missing the final
    # 's'), which merely attached an unused attribute and left the embedding
    # trainable. `requires_grad` is the real autograd flag that freezes it.
    for p in self.token_embedding.parameters():
        p.requires_grad = False

    self.dropout = nn.Dropout(self.dropout_prob)
    self.text = modulars.BiGRUEncoder_2(
        dim_word=self.dim_word,
        dim_hidden=self.dim_hidden,
    )
    self.timereasoning = TimeReasoning(
        hidden_size=self.dim_vision,
        mid_size=self.mid_size,
        state_size=self.state_size,
        num_token=self.num_token,
        edge_size=self.dim_edge,
    )
    self.interIntraBlocks = MultiBlock(
        num_block=self.num_block,
        v_size=self.dim_vision,
        q_size=self.dim_hidden,
        output_size=self.mid_size,
        num_inter_head=self.num_inter_head,
        num_intra_head=self.num_intra_head,
        drop=0.1,
    )
    self.classifier = Classifier(
        in_features=self.mid_size,
        mid_features=2048,
        out_features=self.num_classes,
        drop=0.5,
    )
def __init__(self, **kwargs):
    """
    Construct the network.

    kwargs (each stored as an attribute and consumed below):
        vocab          -- dict with 'answer_token_to_idx' and 'question_token_to_idx'
        dim_word, dim_hidden, dim_vision, mid_size,
        glimpses, dropout_prob, state_size, dim_edge,
        T_ctrl, stack_len, device
        (T_ctrl / stack_len / device are accepted but not read here)
    """
    super(Net, self).__init__()
    # Promote every keyword argument to an instance attribute.
    for name, value in kwargs.items():
        setattr(self, name, value)

    # Output/input vocabulary sizes derived from the vocab tables.
    self.num_classes = len(self.vocab['answer_token_to_idx'])
    self.num_token = len(self.vocab['question_token_to_idx'])

    # NOTE: submodule construction order is kept exactly as in the
    # original so that parameter initialization draws from the RNG in
    # the same sequence.
    self.classifier = Classifier(
        in_features=(self.glimpses * self.dim_vision, self.dim_hidden),
        mid_features=self.mid_size,
        out_features=self.num_classes,
        drop=self.dropout_prob,
    )
    self.token_embedding = nn.Embedding(self.num_token, self.dim_word)
    self.dropout = nn.Dropout(self.dropout_prob)
    self.text = modulars.BiGRUEncoder_2(
        dim_word=self.dim_word,
        dim_hidden=self.dim_hidden,
    )
    self.vision_to_v = FCNet(self.dim_vision, self.dim_hidden, drop=0.3, bias=False)
    self.timereasoning = TimeReasoning(
        hidden_size=self.dim_vision,
        mid_size=self.dim_hidden,
        state_size=self.state_size,
        num_token=self.num_token,
        edge_size=self.dim_edge,
    )
def __init__(self, **kwargs):
    """
    Build the network's submodules.

    kwargs (set as attributes via setattr and read below):
        vocab          -- dict with 'answer_token_to_idx' and 'question_token_to_idx'
        dim_word, dim_hidden, dim_vision, mid_size,
        dropout_prob, state_size, dim_edge, glimpses, device
        (the `glimpses` kwarg is shadowed by the hard-coded local below;
        `device` is accepted but not used here)
    """
    super(Net, self).__init__()
    # Expose every kwarg as an attribute; subsequent lines rely on this.
    for k, v in kwargs.items():
        setattr(self, k, v)

    # NOTE(review): these locals intentionally override any `glimpses`
    # kwarg for the attention modules below — confirm this is deliberate.
    glimpses = 5
    objects = 10

    self.num_classes = len(self.vocab['answer_token_to_idx'])
    self.num_token = len(self.vocab['question_token_to_idx'])
    self.classifier = Classifier(
        in_features=self.mid_size,
        mid_features=self.dim_hidden * 2,
        out_features=self.num_classes,
        drop=self.dropout_prob,
    )

    self.token_embedding = nn.Embedding(self.num_token, self.dim_word)
    # BUG FIX: original wrote `p.require_grad = False` (missing the final
    # 's'), which merely attached an unused attribute and left the embedding
    # trainable. `requires_grad` is the real autograd flag that freezes it.
    for p in self.token_embedding.parameters():
        p.requires_grad = False

    self.dropout = nn.Dropout(self.dropout_prob)
    self.text = modulars.BiGRUEncoder_2(
        dim_word=self.dim_word,
        dim_hidden=self.dim_hidden,
    )
    self.vision_to_v = FCNet(self.dim_vision, self.dim_hidden, drop=0.3, bias=False)
    self.map_two_v_to_edge = FCNet(self.dim_hidden * 2, self.dim_edge, bias=False)
    self.timereasoning = TimeReasoning(
        hidden_size=self.dim_vision,
        mid_size=self.mid_size,
        state_size=self.state_size,
        num_token=self.num_token,
        edge_size=self.dim_edge,
    )
    self.count = Counter(objects)
    # weight_norm re-parameterizes the attention's `h_weight` tensor
    # (dim=None normalizes over the whole tensor).
    self.attention = weight_norm(
        BiAttention(
            v_features=self.dim_vision,
            q_features=self.dim_hidden,
            mid_features=self.dim_hidden,
            glimpses=glimpses,
            drop=0.5,
        ),
        name='h_weight',
        dim=None,
    )
    self.apply_attention = ApplyAttention(
        v_features=self.dim_vision,
        q_features=self.dim_hidden,
        mid_features=self.dim_hidden,
        glimpses=glimpses,
        num_obj=objects,
        drop=0.2,
    )