Example no. 1
    def __getitem__(self, idx):
        idx = self.movie_idx[idx]
        F = self.data[idx][2]
        A = self.data[idx][1]
        T = self.data[idx][3]
        y = self.data[idx][4]

        combined = np.hstack([F, A, T])
        # shape: (timestamps, sum of per-modality dims)

        # Convert to torch tensors
        F = torch.Tensor(F)
        A = torch.Tensor(A)
        T = torch.Tensor(T)
        # y = torch.Tensor(y)

        # Instantiate fusion classes
        FA = fusions.Block([F.shape[1], A.shape[1]], T.shape[1])
        FAT = fusions.Block([T.shape[1], T.shape[1]],
                            F.shape[1] + A.shape[1] + T.shape[1])

        # compute fusions
        temp_output_FA = FA([F, A])
        final_FAT = FAT([temp_output_FA, T])

        # return final_FAT.cpu().detach().numpy(), y
        return combined, y, F, A, T
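
A minimal, standalone sketch of the two-stage Block fusion used above, assuming `fusions` comes from the block.bootstrap.pytorch project (importable as `from block import fusions`); the feature widths are illustrative, not taken from the dataset:

import torch
from block import fusions   # assumption: the block.bootstrap.pytorch fusion factory

F = torch.randn(8, 128)   # e.g. visual features,   (timesteps, dim_F)
A = torch.randn(8, 64)    # e.g. acoustic features, (timesteps, dim_A)
T = torch.randn(8, 300)   # e.g. textual features,  (timesteps, dim_T)

# Stage 1: fuse F and A into a vector the same width as T.
FA = fusions.Block([F.shape[1], A.shape[1]], T.shape[1])
# Stage 2: fuse that summary with T into a single joint embedding.
FAT = fusions.Block([T.shape[1], T.shape[1]],
                    F.shape[1] + A.shape[1] + T.shape[1])

joint = FAT([FA([F, A]), T])
print(joint.shape)   # torch.Size([8, 492])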
Example no. 2
    def __init__(self, image_model, fusion_method, config):
        super(TextImageGazeBertEncoder, self).__init__()

        self.hidden_size = config.hidden_size
        self.fusion_method = fusion_method
        if fusion_method == 'concat':
            self.fc = nn.Linear(self.hidden_size * 2, self.hidden_size)
        elif fusion_method == 'mcb':
            self.fusion = fusions.MCB([self.hidden_size, self.hidden_size],
                                      self.hidden_size)
        elif fusion_method == 'mlb':
            self.fusion = fusions.MLB([self.hidden_size, self.hidden_size],
                                      self.hidden_size)
        elif fusion_method == 'mutan':
            self.fusion = fusions.Mutan([self.hidden_size, self.hidden_size],
                                        self.hidden_size)
        elif fusion_method == 'block':
            self.fusion = fusions.Block([self.hidden_size, self.hidden_size],
                                        self.hidden_size)

        if image_model == 'vgg':
            from model.vgg import VggEncoder
            self.image_gaze_encoder = VggEncoder(self.hidden_size, gaze=True)
        elif image_model == 'resnet':
            from model.resnet import ResNetEncoder
            self.image_gaze_encoder = ResNetEncoder(self.hidden_size,
                                                    gaze=True)

        from model.bert import BertEncoder
        self.text_encoder = BertEncoder(config)
        M = torch.FloatTensor(self.hidden_size, self.hidden_size)
        init.xavier_normal_(M)
        self.M = nn.Parameter(M, requires_grad=True)
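
The constructor above only selects the fusion operator; the snippet does not show how it is applied. A small sketch of the two call conventions it sets up, using random stand-ins for the text and image-gaze embeddings (the import path and sizes are assumptions):

import torch
import torch.nn as nn
from block import fusions   # assumption: the block.bootstrap.pytorch fusion factory

hidden = 256
text_emb = torch.randn(4, hidden)   # stand-in for the BERT text embedding
img_emb = torch.randn(4, hidden)    # stand-in for the VGG/ResNet + gaze embedding

# 'concat' branch: concatenate, then project back to hidden_size.
fc = nn.Linear(hidden * 2, hidden)
fused_concat = fc(torch.cat([text_emb, img_emb], dim=-1))   # (4, 256)

# 'mcb' / 'mlb' / 'mutan' / 'block' all share the same call convention:
fusion = fusions.Block([hidden, hidden], hidden)
fused_block = fusion([text_emb, img_emb])                    # (4, 256)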
Example no. 3
    def __init__(self, cfg):
        super(RelationVKG, self).__init__()

        q_dim = cfg['rnn_dim']*2 if cfg['rnn_bidirection'] else cfg['rnn_dim']
        self.w_emb = WordEmbedding(cfg['n_vocab'], cfg['word_embedding_dim'])
        self.w_emb.init_embedding(cfg['word_dic_file'], cfg['embedding_file'])
        self.q_emb = QuestionEmbedding(cfg['word_embedding_dim'], cfg['rnn_dim'], cfg['rnn_layer'],
                                      cfg['rnn_type'], keep_seq=True, bidirectional=cfg['rnn_bidirection'])
        self.reasoning_net = ReasoningUnit(cfg['v_dim'], q_dim, cfg['rel_dim'], cfg['node_att_hid_dim'],
                                           gat_att_hid_dim=cfg['gat_att_hid_dim'], gat_out_dim=cfg['v_dim'],
                                           gat_n_att=cfg['gat_n_att'],
                                           gat_multi_head_type="concat",
                                           que_self_att_enable=cfg['ques_self_att_enable'],
                                           node_att_enable=cfg['node_att_enable'], gat_enable=cfg['gat_enable'],
                                           spatial_feature_enable=cfg['spatial_feature_enable'], recurrent=cfg['recurrent'],
                                           dropout=cfg['dropout'], wn=cfg['wn'])
        if cfg['fuse_type'] == 'LinearSum':
            self.fuse_net = fusions.LinearSum([cfg['v_dim'], q_dim], cfg['fused_dim'], dropout_input=cfg['dropout'])
        if cfg['fuse_type'] == 'MFB':
            self.fuse_net = fusions.MFB([cfg['v_dim'], q_dim], cfg['fused_dim'], dropout_input=cfg['dropout'])
        if cfg['fuse_type'] == 'MLB':
            self.fuse_net = fusions.MLB([cfg['v_dim'], q_dim], cfg['fused_dim'], mm_dim=2*cfg['fused_dim'], dropout_input=cfg['dropout'])
        if cfg['fuse_type'] == 'BLOCK':
            self.fuse_net = fusions.Block([cfg['v_dim'], q_dim], cfg['fused_dim'], mm_dim=2*cfg['fused_dim'], dropout_input=cfg['dropout'])
        self.classifier = SimpleClassifier(cfg['fused_dim'], cfg['classifier_hid_dim'], cfg['classes'], 0.5)
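
Since the four `fuse_type` branches above differ only in the fusion class and an optional `mm_dim`, the selection can also be written as a lookup table. A hypothetical equivalent inside the same constructor (same constructor calls as above, new helper dict):

        fuse_classes = {
            'LinearSum': (fusions.LinearSum, {}),
            'MFB':       (fusions.MFB,       {}),
            'MLB':       (fusions.MLB,       {'mm_dim': 2 * cfg['fused_dim']}),
            'BLOCK':     (fusions.Block,     {'mm_dim': 2 * cfg['fused_dim']}),
        }
        fuse_cls, extra_kwargs = fuse_classes[cfg['fuse_type']]
        self.fuse_net = fuse_cls([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                 dropout_input=cfg['dropout'], **extra_kwargs)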
Example no. 4
    def __init__(self, image_model, fusion_method, id_to_vec, emb_size, vocab_size, config, device='cuda:0'):
        super(TextImageTransformerEncoder, self).__init__()

        self.hidden_size = config.hidden_size
        self.fusion_method = fusion_method
        if fusion_method == 'concat':
            self.fc = nn.Linear(self.hidden_size*2, self.hidden_size)
        elif fusion_method == 'mcb':
            self.fusion = fusions.MCB([self.hidden_size, self.hidden_size], self.hidden_size)
        elif fusion_method == 'mlb':
            self.fusion = fusions.MLB([self.hidden_size, self.hidden_size], self.hidden_size)
        elif fusion_method == 'mutan':
            self.fusion = fusions.Mutan([self.hidden_size, self.hidden_size], self.hidden_size)
        elif fusion_method == 'block':
            self.fusion = fusions.Block([self.hidden_size, self.hidden_size], self.hidden_size)

        if image_model == 'vgg':
            from model.vgg import VggEncoder
            self.image_encoder = VggEncoder(self.hidden_size)
        elif image_model == 'resnet':
            from model.resnet import ResNetEncoder
            self.image_encoder = ResNetEncoder(self.hidden_size)

        from model.transformer import TransformerEncoder
        self.context_encoder = TransformerEncoder(id_to_vec, emb_size, vocab_size, config, device)
        self.response_encoder = TransformerEncoder(id_to_vec, emb_size, vocab_size, config, device)
        M = torch.FloatTensor(self.hidden_size, self.hidden_size)
        init.xavier_normal_(M)
        self.M = nn.Parameter(M, requires_grad=True)
Example no. 5
    def __init__(self, vid_encoder, qns_encoder, ans_decoder, max_len_v,
                 max_len_q, device):
        """
        Reasoning with Heterogeneous Graph Alignment for Video Question Answering (AAAI20)
        """
        super(HGA, self).__init__()
        self.vid_encoder = vid_encoder
        self.qns_encoder = qns_encoder
        self.ans_decoder = ans_decoder
        self.max_len_v = max_len_v
        self.max_len_q = max_len_q
        self.device = device
        hidden_size = vid_encoder.dim_hidden
        input_dropout_p = vid_encoder.input_dropout_p

        self.q_input_ln = nn.LayerNorm(hidden_size, elementwise_affine=False)
        self.v_input_ln = nn.LayerNorm(hidden_size, elementwise_affine=False)

        self.co_attn = CoAttention(hidden_size,
                                   n_layers=vid_encoder.n_layers,
                                   dropout_p=input_dropout_p)

        self.adj_learner = AdjLearner(hidden_size,
                                      hidden_size,
                                      dropout=input_dropout_p)

        self.gcn = GCN(hidden_size,
                       hidden_size,
                       hidden_size,
                       num_layers=2,
                       dropout=input_dropout_p)

        self.gcn_atten_pool = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.Tanh(),
            nn.Linear(hidden_size // 2, 1),
            nn.Softmax(dim=-1))  # dim=-2 for attention-pooling, otherwise sum-pooling

        self.global_fusion = fusions.Block([hidden_size, hidden_size],
                                           hidden_size,
                                           dropout_input=input_dropout_p)

        self.fusion = fusions.Block([hidden_size, hidden_size], hidden_size)
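
The comment on `gcn_atten_pool` is easy to misread: the head maps node features of shape (B, N, H) to scores of shape (B, N, 1), so `nn.Softmax(dim=-1)` normalizes each length-1 row to exactly 1.0 and the subsequent weighted sum reduces to plain sum-pooling, whereas `dim=-2` would normalize over the N nodes and give true attention pooling. A standalone illustration (the final weighted-sum step is an assumption about how the scores are consumed):

import torch
import torch.nn as nn

B, N, H = 2, 5, 16                          # illustrative batch size, node count, hidden size
node_feats = torch.randn(B, N, H)
pool = nn.Sequential(nn.Linear(H, H // 2), nn.Tanh(),
                     nn.Linear(H // 2, 1), nn.Softmax(dim=-1))
scores = pool(node_feats)                   # (2, 5, 1); with dim=-1 every score is exactly 1.0
pooled = (scores * node_feats).sum(dim=1)   # so this collapses to plain sum-pooling
# nn.Softmax(dim=-2) would instead normalize over the 5 nodes -> attention pooling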
Example no. 6
    def __getitem__(self, idx):
        idx = self.movie_idx[idx]
        F = self.new_data[idx][1]
        Va = self.new_data[idx][2]
        emb_desc = self.new_data[idx][3]
        emb_sit = self.new_data[idx][4]
        emb_sce = self.new_data[idx][5]
        emb_trans = self.new_data[idx][6]
        y = self.new_data[idx][7]

        combined = np.hstack([F, Va, emb_desc, emb_sit, emb_sce, emb_trans])

        F = torch.Tensor(F)
        Va = torch.Tensor(Va)
        emb_desc = torch.Tensor(emb_desc)
        emb_sit = torch.Tensor(emb_sit)
        emb_sce = torch.Tensor(emb_sce)
        emb_trans = torch.Tensor(emb_trans)
        # Instantiate fusion classes
        fusion1 = fusions.Block([F.shape[1], Va.shape[1]], emb_desc.shape[1])
        fusion2 = fusions.Block([emb_desc.shape[1], emb_desc.shape[1]],
                                F.shape[1] + Va.shape[1] + emb_desc.shape[1])

        fusion3 = fusions.Block([emb_sit.shape[1], emb_sce.shape[1]],
                                emb_trans.shape[1])
        fusion4 = fusions.Block([emb_trans.shape[1], emb_trans.shape[1]],
                                emb_sit.shape[1] + emb_sce.shape[1] +
                                emb_trans.shape[1])

        # compute fusions
        temp_output_fusion1 = fusion1([F, Va])
        first_three = fusion2([temp_output_fusion1, emb_desc])
        temp_output_fusion2 = fusion3([emb_sit, emb_sce])
        second_three = fusion4([temp_output_fusion2, emb_trans])

        fusion5 = fusions.Block([first_three.shape[1], second_three.shape[1]],
                                first_three.shape[1] + second_three.shape[1])
        final_fused = fusion5([first_three, second_three])
        return combined, y, F, Va, emb_desc, emb_sit, emb_sce, emb_trans
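
As in Example no. 1, the Block modules here are re-instantiated with fresh random weights on every `__getitem__` call, and `final_fused` is never returned, so the fused tensor is not consumed downstream. If the fused representation is meant to be learned, the fusions would normally live in a model. A hypothetical refactor of the first two fusions (module name and dimension arguments are placeholders):

import torch.nn as nn
from block import fusions   # assumption: the block.bootstrap.pytorch fusion factory

class HierarchicalBlockFusion(nn.Module):
    """Hypothetical module holding the first two fusions of the example."""

    def __init__(self, dim_f, dim_va, dim_desc):
        super().__init__()
        self.fuse_f_va = fusions.Block([dim_f, dim_va], dim_desc)
        self.fuse_all = fusions.Block([dim_desc, dim_desc],
                                      dim_f + dim_va + dim_desc)

    def forward(self, F, Va, emb_desc):
        # Fuse F with Va, then fuse that summary with the description embedding.
        return self.fuse_all([self.fuse_f_va([F, Va]), emb_desc])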
Example no. 7
    def __init__(self,
                 config,
                 max_num_region=200,
                 chunks=50,
                 default_gpu=True,
                 dropout_prob=0.1):
        super(Vilbert, self).__init__(config)

        self.vilbert = BertModel(config)
        self.fusion = fusions.Block(
            input_dims=[max_num_region * config.v_hidden_size,
                        config.bi_hidden_size],
            output_dim=max_num_region,
            mm_dim=512,
            chunks=100)
        self.dropout = nn.Dropout(dropout_prob)
        self.vision_logit = nn.Linear(config.v_hidden_size, 1)
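
In the Block fusion, `mm_dim` is the width of the shared multimodal projection and `chunks` is the number of blocks in the block-superdiagonal decomposition; more chunks means smaller per-block bilinear interactions and fewer parameters, which matters here because the first input is the flattened region grid of width `max_num_region * config.v_hidden_size`. A shape sketch with deliberately small, made-up sizes:

import torch
from block import fusions   # assumption: the block.bootstrap.pytorch fusion factory

B, R, VH, BH = 2, 20, 64, 64              # made-up sizes; the model uses R = max_num_region
region_feats = torch.randn(B, R, VH)      # stand-in for the region hidden states
pooled_text = torch.randn(B, BH)          # stand-in for the pooled bi-modal output

head = fusions.Block([R * VH, BH], output_dim=R, mm_dim=512, chunks=100)
region_logits = head([region_feats.flatten(1), pooled_text])   # (2, 20): one score per region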
Example no. 8
    def __init__(self,
                 vocab_size,
                 s_layers,
                 s_bidirectional,
                 s_rnn_cell,
                 s_embedding,
                 resnet_input_size,
                 c3d_input_size,
                 v_layers,
                 v_bidirectional,
                 v_rnn_cell,
                 hidden_size,
                 dropout_p=0.0,
                 gcn_layers=2,
                 num_heads=8,
                 answer_vocab_size=None,
                 q_max_len=35,
                 v_max_len=80,
                 tf_layers=2,
                 two_loss=False,
                 fusion_type='none',
                 ablation='none'):
        super().__init__()

        self.model_name = 'TwoLSTMandBlock'
        self.task = 'none'
        self.tf_layers = tf_layers
        self.two_loss = two_loss
        self.fusion_type = fusion_type
        self.ablation = ablation
        v_input_size = resnet_input_size
        self.q_max_len = q_max_len
        self.v_max_len = v_max_len

        self.dropout = nn.Dropout(p=dropout_p)

        self.q_input_ln = nn.LayerNorm(hidden_size, elementwise_affine=False)
        self.v_input_ln = nn.LayerNorm(hidden_size, elementwise_affine=False)

        self.sentence_encoder = SentenceEncoderRNN(
            vocab_size,
            hidden_size,
            input_dropout_p=dropout_p,
            dropout_p=dropout_p,
            n_layers=s_layers,
            bidirectional=s_bidirectional,
            rnn_cell=s_rnn_cell,
            embedding=s_embedding)

        self.compress_c3d = nlpnn.WeightDropLinear(c3d_input_size,
                                                   resnet_input_size,
                                                   weight_dropout=dropout_p,
                                                   bias=False)
        # self.video_fusion = fusions.Block(
        #     [v_input_size, v_input_size], v_input_size)
        self.video_fusion = nlpnn.WeightDropLinear(2 * v_input_size,
                                                   v_input_size,
                                                   weight_dropout=dropout_p,
                                                   bias=False)

        self.video_encoder = VideoEncoderRNN(v_input_size,
                                             hidden_size,
                                             input_dropout_p=dropout_p,
                                             dropout_p=dropout_p,
                                             n_layers=v_layers,
                                             bidirectional=v_bidirectional,
                                             rnn_cell=v_rnn_cell)

        self.transformer_encoder = SelfTransformerEncoder(
            hidden_size,
            tf_layers,
            dropout_p,
            vocab_size,
            q_max_len,
            v_max_len,
            embedding=s_embedding,
            position=True)

        # ! masked
        self.crossover_transformer = MaskedCrossoverTransformer(
            q_max_len=q_max_len,
            v_max_len=v_max_len,
            num_heads=8,
            num_layers=tf_layers,
            dropout=dropout_p)

        self.q_transformer = SelfTransformer(q_max_len,
                                             num_heads=8,
                                             num_layers=tf_layers,
                                             dropout=dropout_p,
                                             position=False)
        self.v_transformer = SelfTransformer(v_max_len,
                                             num_heads=8,
                                             num_layers=tf_layers,
                                             dropout=dropout_p,
                                             position=False)

        self.q_selfattn = SelfAttention(hidden_size,
                                        n_layers=tf_layers,
                                        dropout_p=dropout_p)
        self.v_selfattn = SelfAttention(hidden_size,
                                        n_layers=tf_layers,
                                        dropout_p=dropout_p)

        self.co_attn = CoAttention(hidden_size,
                                   n_layers=tf_layers,
                                   dropout_p=dropout_p)

        self.single_attn_semantic = SingleAttention(hidden_size,
                                                    n_layers=tf_layers,
                                                    dropout_p=dropout_p)

        self.single_attn_visual = SingleAttention(hidden_size,
                                                  n_layers=tf_layers,
                                                  dropout_p=dropout_p)

        self.co_concat_attn = CoConcatAttention(hidden_size,
                                                n_layers=tf_layers,
                                                dropout_p=dropout_p)

        self.co_siamese_attn = CoSiameseAttention(hidden_size,
                                                  n_layers=tf_layers,
                                                  dropout_p=dropout_p)

        self.crossover_fusion = fusions.Block([hidden_size, hidden_size],
                                              hidden_size,
                                              dropout_input=dropout_p)

        self.adj_learner = AdjLearner(hidden_size,
                                      hidden_size,
                                      dropout=dropout_p)
        # self.evo_adj_learner = EvoAdjLearner(
        #     hidden_size, hidden_size, dropout=dropout_p)

        self.gcn = GCN(hidden_size,
                       hidden_size,
                       hidden_size,
                       num_layers=gcn_layers,
                       dropout=dropout_p)
        self.gcn_atten_pool = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2), nn.Tanh(),
            nn.Linear(hidden_size // 2, 1), nn.Softmax(dim=-1))

        self.video_adj_learner = AdjLearner(v_input_size,
                                            v_input_size,
                                            dropout=dropout_p)
        self.video_gcn = GCN(v_input_size,
                             v_input_size,
                             v_input_size,
                             num_layers=1,
                             dropout=dropout_p)

        self.video_coattn = CoAttention(v_input_size,
                                        n_layers=1,
                                        dropout_p=dropout_p)

        self.global_fusion = fusions.Block([hidden_size, hidden_size],
                                           hidden_size,
                                           dropout_input=dropout_p)

        if answer_vocab_size is not None:
            self.fusion = fusions.Block([hidden_size, hidden_size],
                                        answer_vocab_size)
            self.fc_fusion = nn.Linear(hidden_size, answer_vocab_size)
        else:
            self.fusion = fusions.Block([hidden_size, hidden_size], 1)
            self.fc_fusion = nn.Linear(hidden_size, 1)
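
The final branch defines two alternative heads: a Block fusion producing answer-vocabulary logits for open-ended QA, or a single score when `answer_vocab_size` is None. A standalone sketch of how each head would be called on pooled question/video vectors (the variable names and the pooling itself are assumptions, not taken from the model):

import torch
from block import fusions   # assumption: the block.bootstrap.pytorch fusion factory

hidden, answer_vocab = 256, 1000
q_global = torch.randn(4, hidden)     # pooled question representation (stand-in)
v_global = torch.randn(4, hidden)     # pooled video representation (stand-in)

open_ended_head = fusions.Block([hidden, hidden], answer_vocab)
logits = open_ended_head([q_global, v_global])      # (4, 1000) answer logits

scoring_head = fusions.Block([hidden, hidden], 1)
score = scoring_head([q_global, v_global])          # (4, 1) single score per pair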