Code Example #1
 def forward(self, sequences, lengths):
     if lengths is None:
         raise "ERROR in this tail you need lengths of sequences."
     
     return feed_forward_rnn(self.lstm,
                             sequences,
                             lengths=lengths)
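
Every example here delegates the recurrent pass to a `feed_forward_rnn(rnn, sequences, lengths)` helper whose implementation is not part of the excerpts. As a rough sketch only, and assuming it follows the usual PyTorch pack/unpack pattern, such a helper might look like this:

    from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

    def feed_forward_rnn(rnn, sequences, lengths):
        # Assumed sketch; the project's real helper may differ.
        # Pack the padded batch so the RNN skips padded timesteps.
        packed = pack_padded_sequence(sequences, lengths.cpu(),
                                      batch_first=True, enforce_sorted=False)
        packed_output, hidden = rnn(packed)
        # Restore a padded tensor of shape (batch, max_len, hidden_size).
        output, _ = pad_packed_sequence(packed_output, batch_first=True,
                                        total_length=sequences.size(1))
        return output, hidden

The later examples unpack the returned `(output, hidden)` tuple, which is why this sketch returns both.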
Code Example #2
    def forward(self, videoFeat, videoFeat_lengths, objects, objects_lengths,
                humans, humans_lengths, tokens, tokens_lengths, start, end,
                localiz):

        (newVideoFeat, ObjectFeat, HumanFeat,
         attentionQHO, attentionQVH, attentionQVO) = self.model_spatial(
            videoFeat, videoFeat_lengths,
            objects, objects_lengths,
            humans, humans_lengths,
            tokens, tokens_lengths)

        mask = self.get_mask_from_sequence_lengths(videoFeat_lengths,
                                                   int(videoFeat.shape[1]))

        videoFeat = newVideoFeat
        attention = self.multimodal_fc2(videoFeat).squeeze()
        if videoFeat.shape[0] == 1:
            attention = attention.unsqueeze(0)
        rqrt_length = torch.rsqrt(tokens_lengths.float()).unsqueeze(1).repeat(
            1, attention.shape[1])
        attention = attention * rqrt_length
        attention = self.mask_softmax(attention, mask)

        output, _ = feed_forward_rnn(self.rnn_localization,
                                     videoFeat,
                                     lengths=videoFeat_lengths)

        pred_start = self.starting(output.view(-1, output.size(2))).view(
            -1, output.size(1), 1).squeeze()
        pred_start = self.mask_softmax(pred_start, mask)

        pred_end = self.ending(output.view(-1, output.size(2))).view(
            -1, output.size(1), 1).squeeze()
        pred_end = self.mask_softmax(pred_end, mask)

        start_loss, individual_start_loss = self.kl_div(
            pred_start, start, videoFeat_lengths)
        end_loss, individual_end_loss = self.kl_div(pred_end, end,
                                                    videoFeat_lengths)

        individual_loss = individual_start_loss + individual_end_loss

        atten_loss = torch.sum(
            -((1 - localiz) * torch.log((1 - attention) + 1E-12)), dim=1)
        atten_loss = torch.mean(atten_loss)

        if self.cfg.ATTENTION_LOSS:
            total_loss = start_loss + end_loss + atten_loss
        else:
            total_loss = start_loss + end_loss

        return total_loss, individual_loss, pred_start, pred_end, attention, atten_loss, attentionQHO, attentionQVH, attentionQVO
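
The longer examples all lean on two masking utilities, `get_mask_from_sequence_lengths` and `mask_softmax`, whose definitions are not shown. A minimal sketch, assuming the first returns a (batch, max_length) 0/1 mask built from the lengths and the second renormalizes the softmax so padded positions receive zero probability:

    import torch

    def get_mask_from_sequence_lengths(lengths, max_length):
        # Assumed sketch: 1.0 where the timestep holds real data, 0.0 on padding.
        positions = torch.arange(max_length, device=lengths.device).unsqueeze(0)
        return (positions < lengths.unsqueeze(1)).float()

    def mask_softmax(logits, mask, eps=1e-12):
        # Assumed sketch: softmax restricted to unmasked positions.
        exp = torch.exp(logits - logits.max(dim=-1, keepdim=True).values) * mask
        return exp / (exp.sum(dim=-1, keepdim=True) + eps)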
Code Example #3
    def forward(self, videoFeat, videoFeat_lengths, tokens, tokens_lengths,
                start, end, localiz):

        mask = self.get_mask_from_sequence_lengths(videoFeat_lengths,
                                                   int(videoFeat.shape[1]))

        filter_start, lengths = self.model_df(tokens, tokens_lengths)

        videoFeat = self.reduction(videoFeat)

        attention = self.attention(videoFeat, filter_start, lengths)
        rqrt_length = torch.rsqrt(lengths.float()).unsqueeze(1).repeat(
            1, attention.shape[1])
        attention = attention * rqrt_length

        attention = self.mask_softmax(attention, mask)

        videoFeat_hat = attention.unsqueeze(2).repeat(
            1, 1, self.cfg.REDUCTION.OUTPUT_SIZE) * videoFeat

        output, _ = feed_forward_rnn(self.rnn_localization,
                                     videoFeat_hat,
                                     lengths=videoFeat_lengths)

        pred_start = self.starting(output.view(-1, output.size(2))).view(
            -1, output.size(1), 1).squeeze()
        pred_start = self.mask_softmax(pred_start, mask)

        pred_end = self.ending(output.view(-1, output.size(2))).view(
            -1, output.size(1), 1).squeeze()
        pred_end = self.mask_softmax(pred_end, mask)

        start_loss, individual_start_loss = self.kl_div(
            pred_start, start, videoFeat_lengths)
        end_loss, individual_end_loss = self.kl_div(pred_end, end,
                                                    videoFeat_lengths)

        individual_loss = individual_start_loss + individual_end_loss

        atten_loss = torch.sum(
            -((1 - localiz) * torch.log((1 - attention) + 1E-12)), dim=1)
        atten_loss = torch.mean(atten_loss)

        # The attention loss is always included in this variant.
        total_loss = start_loss + end_loss + atten_loss

        return total_loss, individual_loss, pred_start, pred_end, attention, atten_loss
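
`kl_div` is called with the predicted boundary distribution, the ground-truth distribution, and the video lengths, and returns both a scalar loss and a per-sample loss. Its implementation is not included in the excerpts; a minimal sketch consistent with that calling convention, assuming both inputs are already masked probability distributions over timesteps, could be:

    import torch

    def kl_div(pred, target, lengths, eps=1e-12):
        # Assumed sketch: per-sample KL divergence between the target boundary
        # distribution and the prediction; `lengths` is unused here because both
        # distributions are assumed to be zero on padded timesteps.
        individual = torch.sum(target * torch.log((target + eps) / (pred + eps)), dim=1)
        return individual.mean(), individual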
Code Example #4
    def forward(self, videoFeat, videoFeat_lengths, tokens, tokens_lengths, start, end, localiz, frame_start, frame_end):

        mask = self.get_mask_from_sequence_lengths(videoFeat_lengths, int(videoFeat.shape[1]))

        output_video = self.model_video_GRU(videoFeat,videoFeat_lengths,mask)
        filter_start, lengths = self.model_df(tokens, tokens_lengths,output_video)
        # output_video =  self.feature_gauss_normalize(output_video)
        # filter_start = self.feature_gauss_normalize(filter_start)
        # attention_weights = attention_weights.detach().cpu().numpy()
        # np.save('/home/thy/disk/proposal_free/experiments/visualization/attention.npy',attention_weights)

        output = self.fusion_layer(filter_start,output_video,self.cfg.ACRM_CLASSIFICATION.FUSION)
        # output = torch.cat([filter_start.unsqueeze(dim=1).repeat(1,output_video.shape[1],1),output_video],dim=-1)
        # output = filter_start.unsqueeze(dim=1).repeat(1,output_video.shape[1],1) * output_video
        if self.is_use_rnn_loc:
            output, _ = feed_forward_rnn(self.rnn_localization,
                            output,
                            lengths=videoFeat_lengths)
            output = self.dropout_layer(output)
        pred_start = self.starting(output.view(-1, output.size(2))).view(-1,output.size(1),1).squeeze()
        pred_start = self.mask_softmax(pred_start, mask)

        pred_end = self.ending(output.view(-1, output.size(2))).view(-1,output.size(1),1).squeeze()
        pred_end = self.mask_softmax(pred_end, mask)

        pred_inter = self.intering(output.view(-1, output.size(2))).view(-1,output.size(1),1).squeeze()
        pred_inter = self.mask_softmax(pred_inter, mask)

        start_loss, individual_start_loss = self.max_boundary(pred_start, frame_start, videoFeat_lengths)
        end_loss, individual_end_loss     = self.max_boundary(pred_end, frame_end, videoFeat_lengths)
        inter_loss, individual_inter_loss = self.max_inter(pred_inter,frame_start,frame_end,videoFeat_lengths)

        individual_loss = individual_start_loss + individual_end_loss + individual_inter_loss
        # Attention loss is not used in this variant; keep a zero placeholder for the return.
        atten_loss = torch.tensor(0.0, device=videoFeat.device)
        # atten_loss = torch.sum(-( (1-localiz) * torch.log((1-attention) + 1E-12)), dim=1)
        # atten_loss = torch.mean(atten_loss)
        attention = output_video[:,:,0]
        # total_loss = start_loss + end_loss + atten_loss
        total_loss = start_loss + end_loss + inter_loss

        return total_loss, individual_loss, pred_start, pred_end, attention, atten_loss
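
Example #4 combines the sentence feature with the per-frame video features through `self.fusion_layer(...)`, selected by `self.cfg.ACRM_CLASSIFICATION.FUSION`; the commented-out lines show concatenation and an element-wise product as the two candidates. Purely to illustrate those two modes (the mode names and the standalone signature below are hypothetical, not the project's API), a fusion function might read:

    import torch

    def fusion_layer(text_feat, video_feat, mode):
        # Hypothetical sketch; mode names are illustrative only.
        # text_feat: (batch, dim); video_feat: (batch, time, dim).
        text_feat = text_feat.unsqueeze(1).expand(-1, video_feat.shape[1], -1)
        if mode == "mul":   # element-wise (Hadamard) fusion
            return text_feat * video_feat
        if mode == "cat":   # channel-wise concatenation; output dim doubles
            return torch.cat([text_feat, video_feat], dim=-1)
        raise ValueError(f"unknown fusion mode: {mode}")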
Code Example #5
    def forward(self, videoFeat, videoFeat_lengths, tokens, tokens_lengths, start, end, localiz, frame_start, frame_end):
        
        ## video mask: 1 at positions that contain video, 0 elsewhere.
        mask = self.get_mask_from_sequence_lengths(videoFeat_lengths, int(videoFeat.shape[1]))
        ## sentence: RNN + pooling + MLP
        filter_start, lengths = self.model_df(tokens, tokens_lengths, videoFeat)
        ## video: linear reduction
        videoFeat   = self.reduction(videoFeat)
        ## attention between video and sentence (bmm)
        attention = self.attention(videoFeat, filter_start, lengths)
        ## scale by 1/sqrt(length)
        rqrt_length = torch.rsqrt(lengths.float()).unsqueeze(1).repeat(1, attention.shape[1])
        attention = attention * rqrt_length
        ##softmax attention
        attention = self.mask_softmax(attention, mask)
        ##video and attention hadamard product 
        videoFeat_hat = attention.unsqueeze(2).repeat(1,1,self.cfg.REDUCTION.OUTPUT_SIZE) * videoFeat
        ## localization GRU
        output, _ = feed_forward_rnn(self.rnn_localization,
                        videoFeat_hat,
                        lengths=videoFeat_lengths)
        
        ## fully connected prediction heads
        pred_start = self.starting(output.view(-1, output.size(2))).view(-1,output.size(1),1).squeeze()
        pred_start = self.mask_softmax(pred_start, mask)

        pred_end = self.ending(output.view(-1, output.size(2))).view(-1,output.size(1),1).squeeze()
        pred_end = self.mask_softmax(pred_end, mask)

        start_loss, individual_start_loss = self.kl_div(pred_start, start, videoFeat_lengths)
        end_loss, individual_end_loss     = self.kl_div(pred_end, end, videoFeat_lengths)

        individual_loss = individual_start_loss + individual_end_loss

        atten_loss = torch.sum(-( (1-localiz) * torch.log((1-attention) + 1E-12)), dim=1)
        atten_loss = torch.mean(atten_loss)

        # The attention loss is always included in this variant.
        total_loss = start_loss + end_loss + atten_loss

        return total_loss, individual_loss, pred_start, pred_end, attention, atten_loss
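
The `atten_loss` regularizer used in examples #2, #3 and #5 only penalizes attention mass that falls outside the annotated segment (`localiz == 0`); positions inside the segment are unconstrained. A tiny worked example of the same expression with made-up numbers:

    import torch

    attention = torch.tensor([[0.1, 0.6, 0.2, 0.1]])   # illustrative masked-softmax output
    localiz   = torch.tensor([[0.0, 1.0, 1.0, 0.0]])   # 1 inside the annotated segment
    atten_loss = torch.sum(-((1 - localiz) * torch.log((1 - attention) + 1E-12)), dim=1)
    print(atten_loss)   # tensor([0.2107]) == -log(1 - 0.1) - log(1 - 0.1)

Only the two frames outside the segment contribute, each adding -log(1 - attention) to the per-sample loss before the batch mean is taken.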