Example no. 1
    def __init__(self, **kwargs):
        """
        kwargs:
             vocab,
             dim_hidden, 
             dim_vision,
             mid_size, 
             glimpses,
             dropout_prob,
             device,
        """
        super(Net, self).__init__()
        # expose every kwarg (vocab, dim_hidden, ...) as an attribute
        for k, v in kwargs.items():
            setattr(self, k, v)

        # head counts and depth of the inter/intra-modality attention stack
        self.num_inter_head = 8
        self.num_intra_head = 8
        self.num_block = 6

        self.num_classes = len(self.vocab['answer_token_to_idx'])

        self.num_token = len(self.vocab['question_token_to_idx'])

        self.token_embedding = nn.Embedding(self.num_token, self.dim_word)

        # freeze the word embeddings; note the attribute is `requires_grad`
        # (`require_grad` would silently set an unused attribute and freeze nothing)
        for p in self.token_embedding.parameters():
            p.requires_grad = False

        self.dropout = nn.Dropout(self.dropout_prob)

        self.text = modulars.BiGRUEncoder_2(
            dim_word=self.dim_word,
            dim_hidden=self.dim_hidden,
        )

        # self.vision_to_v = FCNet(self.dim_vision, self.dim_hidden, drop=0.3, bias=False)
        # self.map_two_v_to_edge = FCNet(self.dim_hidden*2, self.dim_edge, bias=False)

        self.timereasoning = TimeReasoning(hidden_size=self.dim_vision,
                                           mid_size=self.mid_size,
                                           state_size=self.state_size,
                                           num_token=self.num_token,
                                           edge_size=self.dim_edge)

        self.interIntraBlocks = MultiBlock(
            num_block=self.num_block,
            v_size=self.dim_vision,
            q_size=self.dim_hidden,
            output_size=self.mid_size,
            num_inter_head=self.num_inter_head,
            num_intra_head=self.num_intra_head,
            drop=0.1,
        )

        self.classifier = Classifier(
            in_features=self.mid_size,
            mid_features=2048,
            out_features=self.num_classes,
            drop=0.5,
        )
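
A minimal, self-contained sketch of the two idioms in Example no. 1: storing every kwarg as an attribute, and freezing the embedding table through `requires_grad`. The class name `ToyNet` and all sizes are hypothetical, not part of the original model:

    import torch.nn as nn

    class ToyNet(nn.Module):
        def __init__(self, **kwargs):
            super().__init__()
            # expose every kwarg as an attribute, as in Net.__init__ above
            for k, v in kwargs.items():
                setattr(self, k, v)
            self.token_embedding = nn.Embedding(self.num_token, self.dim_word)
            # freezing works only through `requires_grad` (with the final `s`)
            for p in self.token_embedding.parameters():
                p.requires_grad = False

    net = ToyNet(num_token=100, dim_word=300)
    assert not any(p.requires_grad for p in net.token_embedding.parameters())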
Example no. 2
    def __init__(self, **kwargs):
        """
        kwargs:
             vocab,
             dim_hidden, 
             dim_vision,
             mid_size, 
             glimpses,
             dropout_prob,
             T_ctrl,
             stack_len,
             device,
        """
        super(Net, self).__init__()
        for k, v in kwargs.items():
            setattr(self, k, v)

        self.num_classes = len(self.vocab['answer_token_to_idx'])

        self.num_token = len(self.vocab['question_token_to_idx'])

        self.classifier = Classifier(
            in_features=(self.glimpses * self.dim_vision, self.dim_hidden),
            mid_features=self.mid_size,
            out_features=self.num_classes,
            drop=self.dropout_prob)

        self.token_embedding = nn.Embedding(self.num_token, self.dim_word)
        self.dropout = nn.Dropout(self.dropout_prob)

        self.text = modulars.BiGRUEncoder_2(
            dim_word=self.dim_word,
            dim_hidden=self.dim_hidden,
        )

        self.vision_to_v = FCNet(self.dim_vision,
                                 self.dim_hidden,
                                 drop=0.3,
                                 bias=False)
        # self.map_two_v_to_edge = FCNet(self.dim_hidden*2, self.dim_edge, bias=False)

        self.timereasoning = TimeReasoning(
            hidden_size=self.dim_vision,
            mid_size=self.dim_hidden,
            state_size=self.state_size,
            num_token=self.num_token,
            edge_size=self.dim_edge,
        )
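
Example no. 2 passes a tuple of sizes as `in_features`, which suggests that `Classifier` fuses a vision input of size `glimpses * dim_vision` with a question input of size `dim_hidden`. The repository's own `Classifier` is not shown here; the sketch below is only an assumption of what such a two-input head could look like, with a hypothetical name and elementwise-product fusion:

    import torch
    import torch.nn as nn

    class TwoInputClassifier(nn.Module):
        # hypothetical fusion head: project both inputs to a shared size,
        # fuse by elementwise product, then classify
        def __init__(self, in_features, mid_features, out_features, drop=0.0):
            super().__init__()
            v_size, q_size = in_features
            self.v_proj = nn.Linear(v_size, mid_features)
            self.q_proj = nn.Linear(q_size, mid_features)
            self.drop = nn.Dropout(drop)
            self.out = nn.Linear(mid_features, out_features)

        def forward(self, v, q):
            fused = torch.relu(self.v_proj(v)) * torch.relu(self.q_proj(q))
            return self.out(self.drop(fused))

    head = TwoInputClassifier((5 * 2048, 1024), mid_features=512,
                              out_features=3000, drop=0.5)
    logits = head(torch.randn(4, 5 * 2048), torch.randn(4, 1024))
    assert logits.shape == (4, 3000)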
Example no. 3
    def __init__(self, **kwargs):
        """
        kwargs:
             vocab,
             dim_hidden, 
             dim_vision,
             mid_size, 
             glimpses,
             dropout_prob,
             device,
        """
        super(Net, self).__init__()
        for k, v in kwargs.items():
            setattr(self, k, v)

        # hard-coded attention/counting sizes; this local `glimpses`
        # shadows any `glimpses` value passed through kwargs
        glimpses = 5
        objects = 10

        self.num_classes = len(self.vocab['answer_token_to_idx'])

        self.num_token = len(self.vocab['question_token_to_idx'])

        self.classifier = Classifier(in_features=self.mid_size,
                                     mid_features=self.dim_hidden * 2,
                                     out_features=self.num_classes,
                                     drop=self.dropout_prob)

        self.token_embedding = nn.Embedding(self.num_token, self.dim_word)

        # freeze the word embeddings (same `requires_grad` fix as in Example no. 1)
        for p in self.token_embedding.parameters():
            p.requires_grad = False

        self.dropout = nn.Dropout(self.dropout_prob)

        self.text = modulars.BiGRUEncoder_2(
            dim_word=self.dim_word,
            dim_hidden=self.dim_hidden,
        )

        self.vision_to_v = FCNet(self.dim_vision,
                                 self.dim_hidden,
                                 drop=0.3,
                                 bias=False)
        self.map_two_v_to_edge = FCNet(self.dim_hidden * 2,
                                       self.dim_edge,
                                       bias=False)

        self.timereasoning = TimeReasoning(hidden_size=self.dim_vision,
                                           mid_size=self.mid_size,
                                           state_size=self.state_size,
                                           num_token=self.num_token,
                                           edge_size=self.dim_edge)

        # counting module over up to `objects` object proposals
        self.count = Counter(objects)

        self.attention = weight_norm(
            BiAttention(
                v_features=self.dim_vision,
                q_features=self.dim_hidden,
                mid_features=self.dim_hidden,
                glimpses=glimpses,
                drop=0.5,
            ),
            name='h_weight',
            dim=None,
        )

        self.apply_attention = ApplyAttention(
            v_features=self.dim_vision,
            q_features=self.dim_hidden,
            mid_features=self.dim_hidden,
            glimpses=glimpses,
            num_obj=objects,
            drop=0.2,
        )
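
Example no. 3 wraps `BiAttention` in `weight_norm(..., name='h_weight', dim=None)`. `torch.nn.utils.weight_norm` splits the named parameter into a direction tensor (`h_weight_v`) and a magnitude (`h_weight_g`); `dim=None` normalizes over the whole tensor. A small sketch with a stand-in module (the real `BiAttention` is assumed to own a parameter called `h_weight`):

    import torch
    import torch.nn as nn
    from torch.nn.utils import weight_norm

    class ToyAttention(nn.Module):
        # stand-in for BiAttention: any module owning an `h_weight` parameter
        def __init__(self, features):
            super().__init__()
            self.h_weight = nn.Parameter(torch.randn(features))

    layer = weight_norm(ToyAttention(8), name='h_weight', dim=None)
    print(sorted(n for n, _ in layer.named_parameters()))
    # ['h_weight_g', 'h_weight_v']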