def load_part_model(self, action_model_path=None, rnn_path=None):

        # load action net
        if action_model_path is not None:

            act_data = torch.load(action_model_path)
            # act_data = torch.load('./action_net_model.pwf')

            ## to remove module
            new_state_dict = OrderedDict()
            for k, v in act_data.items():
                # if k.find('module') != -1 :
                name = k[7:]  # remove `module.`
                new_state_dict[name] = v

            act_net = ACT_net(self.classes, self.sample_duration)

            act_net.create_architecture()
            act_net.load_state_dict(new_state_dict)
            self.act_net = act_net

        else:
            self.act_net = ACT_net(self.classes, self.sample_duration)
            self.act_net.create_architecture()

        # load lstm
        if rnn_path is not None:

            act_rnn = Act_RNN(self.p_feat_size, int(self.p_feat_size / 2),
                              self.n_classes)

            act_rnn_data = torch.load(rnn_path)
            act_rnn.load_state_dict(act_rnn_data)
            self.act_rnn = act_rnn

        else:
            # hidden size must match the trained checkpoint (p_feat_size / 2)
            self.act_rnn = Act_RNN(self.p_feat_size, int(self.p_feat_size / 2),
                                   self.n_classes)
    # snippet resumes mid-call here; the constructor name below is an
    # assumption, inferred from the Video_Dataset_small_clip call later on
    # this page
    train_data = Video_Dataset_small_clip(
        video_path=dataset_frames,
        frames_dur=sample_duration,
        spatial_transform=spatial_transform,
        temporal_transform=temporal_transform,
        bboxes_file=boxes_file,
        split_txt_path=split_txt_path,
        mode='train',
        classes_idx=cls2idx)
    train_data_loader = torch.utils.data.DataLoader(train_data,
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    num_workers=2,
                                                    pin_memory=True)

    # Init action_net
    act_model = ACT_net(actions, sample_duration, device=device)

    act_model.create_architecture(model_path=model_path)

    if torch.cuda.device_count() > 1:
        print('Using {} GPUs!'.format(torch.cuda.device_count()))
        act_model = nn.DataParallel(act_model)

    act_model.to(device)

    lr = 0.1
    lr_decay_step = 10
    lr_decay_gamma = 0.1

    params = []
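
    # A minimal sketch (an assumption, mirroring the truncated parameter loop
    # at the bottom of this page) of how `params` is typically filled so that
    # bias terms get double the learning rate before the optimizer is built:
    for key, value in dict(act_model.named_parameters()).items():
        if value.requires_grad:
            if 'bias' in key:
                params += [{'params': [value], 'lr': lr * 2}]
            else:
                params += [{'params': [value], 'lr': lr}]
    optimizer = torch.optim.SGD(params, lr=lr, momentum=0.9)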
Example #3
    cls2idx = {actions[i]: i for i in range(0, len(actions))}

    ### get videos id

    spatial_transform = Compose([
        Scale(sample_size),  # Resize(sample_size) in newer torchvision
        ToTensor(),
        Normalize(mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(sample_duration)

    n_classes = len(actions)

    # Init action_net

    model = ACT_net(actions, sample_duration)
    model.create_architecture()
    model = nn.DataParallel(model)
    model.to(device)

    model_data = torch.load('./action_net_model_both_without_avg.pwf')

    model.load_state_dict(model_data)
    model.eval()

    data = Video_Dataset_small_clip(dataset_frames,
                                    frames_dur=sample_duration,
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    bboxes_file=boxes_file,
                                    split_txt_path=split_txt_path,
Example #4
        'LongJump', 'PoleVault', 'RopeClimbing', 'SalsaSpin', 'SkateBoarding',
        'Skiing', 'Skijet', 'SoccerJuggling', 'Surfing', 'TennisSwing',
        'TrampolineJumping', 'VolleyballSpiking', 'WalkingWithDog'
    ]

    cls2idx = {actions[i]: i for i in range(0, len(actions))}

    spatial_transform = Compose([
        Scale(sample_size),  # Resize(sample_size) in newer torchvision
        ToTensor(),
        Normalize(mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(sample_duration)

    # Init action_net
    model = ACT_net(actions, sample_duration)
    model.create_architecture()
    model = nn.DataParallel(model)
    model.to(device)

    # model_data = torch.load('./actio_net_model_both.pwf')
    # model_data = torch.load('./action_net_model_both_without_avg.pwf')
    # model_data = torch.load('./action_net_model_16frm_64.pwf')
    # model_data = torch.load('./action_net_model_both_sgl_frm.pwf')
    model_data = torch.load('./action_net_model_both.pwf')
    # model_data = torch.load('./action_net_model_part1_1_8frm.pwf')
    model.load_state_dict(model_data)

    # model_data = torch.load('./region_net_8frm.pwf')
    # model.module.act_rpn.load_state_dict(model_data)
Example #5
        Normalize(mean, [1, 1, 1])
    ])
    temporal_transform = LoopPadding(sample_duration)

    n_classes = len(actions)

    #######################################################
    #          Part 1-1 - train TPN - without reg          #
    #######################################################

    print(' -----------------------------------------------------')
    print('|          Part 1-1 - train TPN - without reg         |')
    print(' -----------------------------------------------------')

    # Init action_net
    act_model = ACT_net(actions, sample_duration)
    act_model.create_architecture()
    if torch.cuda.device_count() > 1:
        print('Using {} GPUs!'.format(torch.cuda.device_count()))

    act_model = nn.DataParallel(act_model)

    act_model.to(device)

    lr = 0.1
    lr_decay_step = 10
    lr_decay_gamma = 0.1

    params = []

    # for p in act_model.module.reg_layer.parameters() : p.requires_grad=False
                                 Normalize(mean, [1, 1, 1])])
    temporal_transform = LoopPadding(sample_duration)

    n_classes = len(actions)


    #######################################################
    #          Part 1-1 - train TPN - without reg          #
    #######################################################

    print(' -----------------------------------------------------')
    print('|          Part 1-1 - train TPN - without reg         |')
    print(' -----------------------------------------------------')

    # Init action_net
    act_model = ACT_net(actions, sample_duration)
    act_model.create_architecture()
    if torch.cuda.device_count() > 1:
        print('Using {} GPUs!'.format(torch.cuda.device_count()))

    act_model = nn.DataParallel(act_model)
    act_model.to(device)

    model_data = torch.load('./action_net_model_16frm.pwf')
    act_model.load_state_dict(model_data)


    # lr = 0.1
    lr = 0.00001
    lr_decay_step = 10
    lr_decay_gamma = 0.1
                                 Normalize(mean, [1, 1, 1])])
    temporal_transform = LoopPadding(sample_duration)

    n_classes = len(actions)


    #######################################################
    #          Part 1-1 - train TPN - without reg          #
    #######################################################

    print(' -----------------------------------------------------')
    print('|          Part 1-1 - train TPN - without reg         |')
    print(' -----------------------------------------------------')

    # Init action_net
    act_model = ACT_net(actions, sample_duration)
    act_model.create_architecture()
    if torch.cuda.device_count() > 1:
        print('Using {} GPUs!'.format(torch.cuda.device_count()))

    act_model = nn.DataParallel(act_model)

    act_model.to(device)

    lr = 0.1
    lr_decay_step = 10
    lr_decay_gamma = 0.1
    
    params = []

    # for p in act_model.module.reg_layer.parameters() : p.requires_grad=False
class Model(nn.Module):
    """ 
    action localization network which contains:
    -ACT_net : a network for proposing action tubes for 16 frames
    -TCN net : a dilation network which classifies the input tubes
    """
    def __init__(self, actions, sample_duration, sample_size):
        super(Model, self).__init__()

        self.classes = actions
        self.n_classes = len(actions)

        # self.act_net = ACT_net(actions,sample_duration)

        ## general options
        self.sample_duration = sample_duration
        self.sample_size = sample_size
        self.step = int(self.sample_duration / 2)
        self.p_feat_size = 64  # 128 # 256 # 512

        # For connection
        self.max_num_tubes = conf.MAX_NUMBER_TUBES
        self.connection_thresh = conf.CONNECTION_THRESH
        self.update_thresh_step = conf.UPDATE_THRESH
        self.calc = Calculator(self.max_num_tubes, self.update_thresh_step,
                               self.connection_thresh)

    def forward(self, n_devs, dataset_folder, vid_names, clips, vid_id, boxes,
                mode, cls2idx, num_actions, num_frames, h_, w_):
        '''
        TODO describe procedure
        '''

        # print('boxes.shape :',boxes.shape)

        ## define a dataloader for the whole video
        # print('----------Inside----------')
        # print('num_frames :',num_frames)
        # print('clips.shape :',clips.shape)

        clips = clips.squeeze(0)
        clips = clips[:num_frames]

        print('num_frames :', num_frames)
        print('clips.shape :', clips.shape)

        if self.training:
            boxes = boxes.squeeze(0).permute(1, 0, 2).cpu()
            boxes = boxes[:num_frames, :num_actions]

        batch_size = 2
        # batch_size = 16 #

        num_images = 1
        rois_per_image = int(cfg.TRAIN.BATCH_SIZE /
                             num_images) if self.training else 150

        data = single_video(dataset_folder,
                            h_,
                            w_,
                            vid_names,
                            vid_id,
                            frames_dur=self.sample_duration,
                            sample_size=self.sample_size,
                            classes_idx=cls2idx,
                            n_frames=num_frames)

        data_loader = torch.utils.data.DataLoader(
            data,
            batch_size=batch_size,
            pin_memory=False,  # num_workers=num_workers, pin_memory=True,
            # shuffle=False, num_workers=8)
            shuffle=False)

        n_clips = len(data)

        features = torch.zeros(n_clips, rois_per_image, self.p_feat_size,
                               self.sample_duration)
        p_tubes = torch.zeros(n_clips, rois_per_image, self.sample_duration *
                              4)  # all the proposed tube-rois
        actioness_score = torch.zeros(n_clips, rois_per_image)
        overlaps_scores = torch.zeros(n_clips, rois_per_image, rois_per_image)

        f_tubes = []

        if self.training:

            f_gt_tubes = torch.zeros(n_clips, num_actions,
                                     self.sample_duration * 4)  # gt_tubes
            tubes_labels = torch.zeros(n_clips, rois_per_image)  # tubes rois
            loops = int(np.ceil(n_clips / batch_size))
            labels = torch.zeros(num_actions)

            for i in range(num_actions):
                idx = boxes[:, i, 4].nonzero().view(-1)
                labels[i] = boxes[idx[0], i, 4]  # label of first annotated frame

        for step, dt in enumerate(data_loader):

            # if step == 1:
            #     break
            print('\tstep :', step)

            frame_indices, im_info, start_fr = dt
            clips_ = clips[frame_indices].cuda()

            if self.training:
                boxes_ = boxes[frame_indices].cuda()
                box_ = boxes_.permute(0, 2, 1,
                                      3).float().contiguous()[:, :, :, :-1]
            else:
                box_ = None

            im_info = im_info.cuda()
            start_fr = start_fr.cuda()

            with torch.no_grad():
                tubes, pooled_feat, \
                rpn_loss_cls,  rpn_loss_bbox, \
                _,_, rois_label, \
                sgl_rois_bbox_pred, sgl_rois_bbox_loss = self.act_net(clips_.permute(0,2,1,3,4),
                                                            im_info,
                                                            None,
                                                            box_,
                                                            start_fr)

            pooled_feat = pooled_feat.view(-1, rois_per_image,
                                           self.p_feat_size,
                                           self.sample_duration)

            indexes_ = (torch.arange(0, tubes.size(0)) *
                        int(self.sample_duration / 2) +
                        start_fr[0].cpu()).unsqueeze(1)
            indexes_ = indexes_.expand(tubes.size(0),
                                       tubes.size(1)).type_as(tubes)

            idx_s = step * batch_size
            idx_e = min(step * batch_size + batch_size, n_clips)

            features[idx_s:idx_e] = pooled_feat
            p_tubes[idx_s:idx_e] = tubes[:, :, 1:-1]
            actioness_score[idx_s:idx_e] = tubes[:, :, -1]

            if self.training:

                box = boxes_.permute(0, 2, 1, 3).contiguous()[:, :, :, :-2]
                box = box.contiguous().view(box.size(0), box.size(1), -1)

                f_gt_tubes[idx_s:idx_e] = box
                tubes_labels[idx_s:idx_e] = rois_label.squeeze(-1).type_as(
                    tubes_labels)

        ########################################################
        #          Calculate overlaps and connections          #
        ########################################################

        overlaps_scores = torch.zeros(n_clips, rois_per_image,
                                      rois_per_image).type_as(overlaps_scores)

        for i in range(n_clips - 1):
            overlaps_scores[i] = tube_overlaps(
                p_tubes[i, :, int(self.sample_duration * 4 / 2):],
                p_tubes[i + 1, :, :int(self.sample_duration * 4 / 2)])

        if n_clips > 1:
            final_scores, final_poss = self.calc(
                overlaps_scores.cuda(), actioness_score.cuda(),
                torch.Tensor([n_clips]), torch.Tensor([rois_per_image]))
        else:
            offset = torch.arange(rois_per_image).float()
            final_poss = torch.stack([torch.zeros((rois_per_image)), offset],
                                     dim=1).unsqueeze(1).long()

        ## Now connect the tubes
        final_tubes = torch.zeros(final_poss.size(0), num_frames, 4)
        f_tubes = []
        for i in range(final_poss.size(0)):
            tub = []
            for j in range(final_poss.size(1)):

                curr_ = final_poss[i, j]
                start_fr = curr_[0] * int(self.sample_duration / 2)
                end_fr = min((curr_[0] * int(self.sample_duration / 2) +
                              self.sample_duration).type_as(num_frames),
                             num_frames).type_as(start_fr)

                if curr_[0] == -1:
                    break

                curr_frames = p_tubes[curr_[0], curr_[1]]
                tub.append((curr_[0].item(), curr_[1].item()))
                ## TODO change with avg
                final_tubes[i, start_fr:end_fr] = torch.max(
                    curr_frames.view(-1, 4).contiguous()[:(end_fr -
                                                           start_fr).long()],
                    final_tubes[i, start_fr:end_fr].type_as(curr_frames))
            f_tubes.append(tub)

        ###################################################
        #          Choose gt Tubes for RCNN\TCN           #
        ###################################################
        if self.training:

            # # get gt tubes and feats
            ##  calculate overlaps
            boxes_ = boxes.permute(1, 0, 2).contiguous()
            boxes_ = boxes_[:, :, :4].contiguous().view(num_actions, -1)

            overlaps = tube_overlaps(final_tubes.view(-1, num_frames * 4),
                                     boxes_.type_as(final_tubes))
            max_overlaps, _ = torch.max(overlaps, 1)
            max_overlaps = max_overlaps.clamp_(min=0)
            ## TODO change numbers
            bg_tubes_indices = max_overlaps.lt(0.3).nonzero()
            bg_tubes_indices_picked = (torch.rand(5) *
                                       bg_tubes_indices.size(0)).long()
            bg_tubes_list = [
                f_tubes[i] for i in bg_tubes_indices[bg_tubes_indices_picked]
            ]
            bg_labels = torch.zeros(len(bg_tubes_list))

            gt_tubes_list = [[] for i in range(num_actions)]

            for i in range(n_clips):

                overlaps = tube_overlaps(p_tubes[i], f_gt_tubes[i])
                max_overlaps, argmax_overlaps = torch.max(overlaps, 0)

                for j in range(num_actions):
                    if max_overlaps[j] == 1.0:
                        gt_tubes_list[j].append((i, j))

            ## concate fb, bg tubes
            f_tubes = gt_tubes_list + bg_tubes_list
            target_lbl = torch.cat([labels, bg_labels], dim=0)

        ##############################################

        if len(f_tubes) == 0:
            print('------------------')
            print('    empty tube    ')
            return torch.Tensor([]).cuda(), torch.Tensor([]).cuda(), None
        max_seq = reduce(lambda x, y: y if len(y) > len(x) else x, f_tubes)
        max_length = len(max_seq)

        ## calculate input rois
        ## f_feats.shape : [#f_tubes, max_length, 512]
        final_video_tubes = torch.zeros(len(f_tubes), 6).cuda()
        prob_out = torch.zeros(len(f_tubes), self.n_classes).cuda()

        for i in range(len(f_tubes)):

            seq = f_tubes[i]
            tmp_tube = torch.Tensor(len(seq), 6)
            feats = torch.Tensor(len(seq), self.p_feat_size)

            for j in range(len(seq)):

                feats[j] = features[seq[j][0], seq[j][1]].mean(1)
                tmp_tube[j] = p_tubes[seq[j]][1:7]

            prob_out[i] = self.act_rnn(feats.cuda())
            if prob_out[i, 0] != prob_out[i, 0]:  # NaN check
                print('tmp_tube :', tmp_tube, ' prob_out :', prob_out,
                      ' feats :', feats.cpu().numpy(), ' feats.shape :',
                      feats.shape, ' target_lbl :', target_lbl)
                exit(-1)

        # ##########################################
        # #           Time for Linear Loss         #
        # ##########################################

        cls_loss = torch.Tensor([0]).cuda()

        final_tubes = final_tubes.type_as(final_poss)
        # # classification probability
        if self.training:
            cls_loss = F.cross_entropy(prob_out.cpu(),
                                       target_lbl.long()).cuda()

        if self.training:
            return final_tubes, prob_out, cls_loss
        else:
            return final_tubes, prob_out, None

    def deactivate_action_net_grad(self):

        for p in self.act_net.parameters():
            p.requires_grad = False
        # self.act_net.eval()
        # for key, value in dict(self.named_parameters()).items():
        #     print(key, value.requires_grad)
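
        # Typical usage (a sketch, not taken from the original training
        # script): freeze the proposal net, then optimize only what still
        # requires gradients.
        #
        #   model.deactivate_action_net_grad()
        #   trainable = filter(lambda p: p.requires_grad, model.parameters())
        #   optimizer = torch.optim.SGD(trainable, lr=0.001, momentum=0.9)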

    def load_part_model(self, action_model_path=None, rnn_path=None):

        # load action net
        if action_model_path is not None:

            act_data = torch.load(action_model_path)
            # act_data = torch.load('./action_net_model.pwf')

            ## to remove module
            new_state_dict = OrderedDict()
            for k, v in act_data.items():
                # if k.find('module') != -1 :
                name = k[7:]  # remove `module.`
                new_state_dict[name] = v

            act_net = ACT_net(self.classes, self.sample_duration)

            act_net.create_architecture()
            act_net.load_state_dict(new_state_dict)
            self.act_net = act_net

        else:
            self.act_net = ACT_net(self.classes, self.sample_duration)
            self.act_net.create_architecture()

        # load lstm
        if rnn_path is not None:

            act_rnn = Act_RNN(self.p_feat_size, int(self.p_feat_size / 2),
                              self.n_classes)

            act_rnn_data = torch.load(rnn_path)
            act_rnn.load_state_dict(act_rnn_data)
            self.act_rnn = act_rnn

        else:
            # hidden size must match the trained checkpoint (p_feat_size / 2)
            self.act_rnn = Act_RNN(self.p_feat_size, int(self.p_feat_size / 2),
                                   self.n_classes)
    def load_part_model(self,
                        resnet_path=None,
                        action_model_path=None,
                        rnn_path=None):

        # load action net
        if action_model_path is not None:

            act_data = torch.load(action_model_path)
            # act_data = torch.load('./action_net_model.pwf')

            ## to remove module
            new_state_dict = OrderedDict()
            for k, v in act_data.items():
                # if k.find('module') != -1 :
                name = k[7:]  # remove `module.`
                new_state_dict[name] = v

            act_net = ACT_net(self.classes, self.sample_duration)
            if resnet_path is not None:
                act_net.create_architecture(model_path=resnet_path)
            else:
                act_net.create_architecture()
            act_net.load_state_dict(new_state_dict)
            self.act_net = act_net

        else:
            self.act_net = ACT_net(self.classes, self.sample_duration)
            if resnet_path is not None:
                self.act_net.create_architecture(model_path=resnet_path)
            else:
                self.act_net.create_architecture()

        # load lstm
        if rnn_path is not None:

            # act_rnn = Act_RNN(self.p_feat_size,int(self.p_feat_size/2),self.n_classes)
            # act_rnn_data = torch.load(rnn_path)
            # act_rnn.load_state_dict(act_rnn_data)

            # earlier variant (kept for reference):
            #   Linear(64*sample_duration, 256) -> ReLU -> Dropout(0.8)
            #   -> Linear(256, n_classes)
            act_rnn = nn.Sequential(
                nn.Linear(64 * self.sample_duration, self.n_classes))
            act_rnn_data = torch.load(rnn_path)
            act_rnn.load_state_dict(act_rnn_data)
            self.act_rnn = act_rnn

        else:
            # self.act_rnn =Act_RNN(self.p_feat_size,int(self.p_feat_size/2),self.n_classes)
            # same single-layer classifier as above
            self.act_rnn = nn.Sequential(
                nn.Linear(64 * self.sample_duration, self.n_classes))
            for m in self.act_rnn.modules():
                if isinstance(m, nn.Linear):
                    # truncated-normal-like init (rough approximation);
                    # `stddev` and `mean` must be defined in the enclosing
                    # scope (not shown in this snippet)
                    m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)
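
    # Example call (a hedged sketch; the checkpoint paths are placeholders,
    # not files known to ship with this code):
    #
    #   model = Model(actions, sample_duration=16, sample_size=112)
    #   model.load_part_model(resnet_path='./resnet-34-kinetics.pth',
    #                         action_model_path='./action_net_model.pwf',
    #                         rnn_path='./act_rnn.pwf')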
class Model(nn.Module):
    """ 
    action localization network which contains:
    -ACT_net : a network for proposing action tubes for 16 frames
    -TCN net : a dilation network which classifies the input tubes
    """
    def __init__(self, actions, sample_duration, sample_size):
        super(Model, self).__init__()

        self.classes = actions
        self.n_classes = len(actions)

        # self.act_net = ACT_net(actions,sample_duration)

        ## general options
        self.sample_duration = sample_duration
        self.sample_size = sample_size
        self.step = int(self.sample_duration / 2)
        self.p_feat_size = 64  # 128 # 256 # 512

        # For connection
        self.max_num_tubes = conf.MAX_NUMBER_TUBES
        self.connection_thresh = conf.CONNECTION_THRESH
        self.update_thresh = conf.UPDATE_THRESH
        self.calc = Calculator(self.max_num_tubes, self.update_thresh,
                               self.connection_thresh)

    def forward(self, n_devs, dataset_folder, vid_names, clips, vid_id, boxes,
                mode, cls2idx, num_actions, num_frames, h_, w_):
        '''
        TODO describe procedure
        '''

        # print('boxes.shape :',boxes.shape)

        ## define a dataloader for the whole video
        # print('----------Inside----------')
        # print('num_frames :',num_frames)
        # print('clips.shape :',clips.shape)

        clips = clips.squeeze(0)
        ret_n_frames = clips.size(0)
        clips = clips[:num_frames]

        # print('num_frames :',num_frames)
        # print('clips.shape :',clips.shape)
        # exit(-1)
        if self.training:
            boxes = boxes.squeeze(0).permute(1, 0, 2).cpu()
            boxes = boxes[:num_frames, :num_actions].clamp_(min=0)

        batch_size = 4
        # batch_size = 2 #
        # batch_size = 16 #

        num_images = 1
        rois_per_image = int(conf.TRAIN.BATCH_SIZE /
                             num_images) if self.training else 150

        data = single_video(dataset_folder,
                            h_,
                            w_,
                            vid_names,
                            vid_id,
                            frames_dur=self.sample_duration,
                            sample_size=self.sample_size,
                            classes_idx=cls2idx,
                            n_frames=num_frames)

        data_loader = torch.utils.data.DataLoader(
            data,
            batch_size=batch_size,
            pin_memory=False,  # num_workers=num_workers, pin_memory=True,
            # shuffle=False, num_workers=8)
            shuffle=False)

        n_clips = len(data)

        features = torch.zeros(n_clips, rois_per_image, self.p_feat_size,
                               self.sample_duration).type_as(clips)
        p_tubes = torch.zeros(n_clips, rois_per_image, self.sample_duration *
                              4).type_as(clips)  # all the proposed tube-rois
        actioness_score = torch.zeros(n_clips, rois_per_image).type_as(clips)
        overlaps_scores = torch.zeros(n_clips, rois_per_image,
                                      rois_per_image).type_as(clips)

        f_tubes = []


        if self.training:

            f_gt_tubes = torch.zeros(n_clips, num_actions,
                                     self.sample_duration * 4)  # gt_tubes
            tubes_labels = torch.zeros(n_clips, rois_per_image)  # tubes rois
            loops = int(np.ceil(n_clips / batch_size))
            labels = torch.zeros(num_actions)

            for i in range(num_actions):
                idx = boxes[:, i, 4].nonzero().view(-1)
                labels[i] = boxes[idx[0], i, 4]

        ## Init connect thresh
        self.calc.thresh = self.connection_thresh

        for step, dt in enumerate(data_loader):

            frame_indices, im_info, start_fr = dt
            clips_ = clips[frame_indices].cuda()

            if self.training:
                boxes_ = boxes[frame_indices].cuda()
                box_ = boxes_.permute(0, 2, 1,
                                      3).float().contiguous()[:, :, :, :-1]
            else:
                box_ = None

            im_info = im_info.cuda()
            start_fr = start_fr.cuda()

            with torch.no_grad():
                tubes, pooled_feat, \
                rpn_loss_cls,  rpn_loss_bbox, \
                _,_, rois_label, \
                sgl_rois_bbox_pred, sgl_rois_bbox_loss = self.act_net(clips_.permute(0,2,1,3,4),
                                                            im_info,
                                                            None,
                                                            box_,
                                                            start_fr)
            pooled_feat = pooled_feat.mean(-1).mean(-1)
            pooled_feat = pooled_feat.view(-1, rois_per_image,
                                           self.p_feat_size,
                                           self.sample_duration)

            # regression
            n_tubes = len(tubes)
            if not self.training:
                tubes = tubes.view(-1, self.sample_duration * 4 + 2)
                tubes[:,1:-1] = tube_transform_inv(tubes[:,1:-1],\
                                               sgl_rois_bbox_pred.view(-1,self.sample_duration*4),(1.0,1.0,1.0,1.0))
                tubes = tubes.view(n_tubes, rois_per_image,
                                   self.sample_duration * 4 + 2)
                tubes[:, :, 1:-1] = clip_boxes(tubes[:, :, 1:-1], im_info,
                                               tubes.size(0))

            indexes_ = (torch.arange(0, tubes.size(0)) *
                        int(self.sample_duration / 2) +
                        start_fr[0].cpu()).unsqueeze(1)
            indexes_ = indexes_.expand(tubes.size(0),
                                       tubes.size(1)).type_as(tubes)

            idx_s = step * batch_size
            idx_e = min(step * batch_size + batch_size, n_clips)

            features[idx_s:idx_e] = pooled_feat
            p_tubes[idx_s:idx_e] = tubes[:, :, 1:-1]
            actioness_score[idx_s:idx_e] = tubes[:, :, -1]

            if self.training:

                box = boxes_.permute(0, 2, 1, 3).contiguous()[:, :, :, :-2]
                box = box.contiguous().view(box.size(0), box.size(1), -1)

                f_gt_tubes[idx_s:idx_e] = box

            # connection algo
            for i in range(idx_s, idx_e):
                if i == 0:

                    # Init tensors for connecting
                    offset = torch.arange(0, rois_per_image).int().cuda()
                    ones_t = torch.ones(rois_per_image).int().cuda()
                    zeros_t = torch.zeros(rois_per_image, n_clips,
                                          2).int().cuda() - 1

                    pos = torch.zeros(rois_per_image, n_clips,
                                      2).int().cuda() - 1  # initial pos
                    pos[:, 0, 0] = 0
                    pos[:, 0, 1] = offset.contiguous(
                    )  # contains the current tubes to be connected
                    pos_indices = torch.zeros(rois_per_image).int().cuda(
                    )  # contains the pos of the last element of the previous tensor
                    actioness_scr = actioness_score[0].float().cuda(
                    )  # actioness sum of active tubes
                    overlaps_scr = torch.zeros(rois_per_image).float().cuda(
                    )  # overlaps  sum of active tubes
                    final_scores = torch.Tensor().float().cuda(
                    )  # final scores
                    final_poss = torch.Tensor().int().cuda()  # final tubes

                    continue

                overlaps_ = tube_overlaps(
                    p_tubes[i - 1, :,
                            int(self.sample_duration * 4 / 2):],
                    p_tubes[i, :, :int(self.sample_duration * 4 /
                                       2)]).type_as(p_tubes)

                pos, pos_indices, \
                f_scores, actioness_scr, \
                overlaps_scr = self.calc(torch.Tensor([n_clips]),torch.Tensor([rois_per_image]),torch.Tensor([pos.size(0)]),
                                         pos, pos_indices, actioness_scr, overlaps_scr,
                                         overlaps_, actioness_score[i], torch.Tensor([i]))

                if pos.size(0) > self.update_thresh:

                    final_scores, final_poss, pos , pos_indices, \
                    actioness_scr, overlaps_scr,  f_scores = self.calc.update_scores(final_scores,final_poss, f_scores, pos, pos_indices, actioness_scr, overlaps_scr)

                if f_scores.dim() == 0:
                    f_scores = f_scores.unsqueeze(0)
                    pos = pos.unsqueeze(0)
                    pos_indices = pos_indices.unsqueeze(0)
                    actioness_scr = actioness_scr.unsqueeze(0)
                    overlaps_scr = overlaps_scr.unsqueeze(0)
                if final_scores.dim() == 0:
                    final_scores = final_scores.unsqueeze(0)
                    final_poss = final_poss.unsqueeze(0)

                try:
                    final_scores = torch.cat((final_scores, f_scores))
                except Exception:
                    print('final_scores :', final_scores)
                    print('final_scores.shape :', final_scores.shape)
                    print('final_scores.dim() :', final_scores.dim())
                    print('f_scores :', f_scores)
                    print('f_scores.shape :', f_scores.shape)
                    print('f_scores.dim() :', f_scores.dim())
                    exit(-1)
                try:
                    final_poss = torch.cat((final_poss, pos))
                except Exception:
                    print('final_poss :', final_poss)
                    print('final_poss.shape :', final_poss.shape)
                    print('final_poss.dim() :', final_poss.dim())
                    print('pos :', pos)
                    print('pos.shape :', pos.shape)
                    print('pos.dim() :', pos.dim())
                    exit(-1)

                # add new tubes
                pos = torch.cat((pos, zeros_t))
                pos[-rois_per_image:, 0, 0] = ones_t * i
                pos[-rois_per_image:, 0, 1] = offset

                pos_indices = torch.cat(
                    (pos_indices, torch.zeros(
                        (rois_per_image)).type_as(pos_indices)))
                actioness_scr = torch.cat((actioness_scr, actioness_score[i]))
                overlaps_scr = torch.cat(
                    (overlaps_scr, torch.zeros(
                        (rois_per_image)).type_as(overlaps_scr)))

        ## add only last layers
        ## TODO check again
        indices = actioness_score[-1].ge(self.calc.thresh).nonzero().view(-1)
        if indices.nelement() > 0:
            zeros_t[:, 0, 0] = idx_e - 1
            zeros_t[:, 0, 1] = offset
            final_poss = torch.cat([final_poss, zeros_t[indices]])

        if pos.size(0) > self.update_thresh:
            print('Updating thresh...', final_scores.shape, final_poss.shape,
                  pos.shape, f_scores.shape, pos_indices.shape)
            final_scores, final_poss, pos , pos_indices, \
                actioness_scr, overlaps_scr,  f_scores = self.calc.update_scores(final_scores,final_poss, f_scores, pos, pos_indices, actioness_scr, overlaps_scr)
            print('Updating thresh...', final_scores.shape, final_poss.shape,
                  pos.shape, f_scores.shape, pos_indices.shape)

        final_tubes = torch.zeros(final_poss.size(0), num_frames, 4)

        f_tubes = []

        for i in range(final_poss.size(0)):
            tub = []
            for j in range(final_poss.size(1)):

                curr_ = final_poss[i, j]
                start_fr = curr_[0] * int(self.sample_duration / 2)
                end_fr = min((curr_[0] * int(self.sample_duration / 2) +
                              self.sample_duration).type_as(num_frames),
                             num_frames).type_as(start_fr)

                if curr_[0] == -1:
                    break

                curr_frames = p_tubes[curr_[0], curr_[1]]
                tub.append((curr_[0].item(), curr_[1].item()))
                ## TODO change with avg
                final_tubes[i, start_fr:end_fr] = torch.max(
                    curr_frames.view(-1, 4).contiguous()[:(end_fr -
                                                           start_fr).long()],
                    final_tubes[i, start_fr:end_fr].type_as(curr_frames))
            f_tubes.append(tub)

        ###################################################
        #          Choose gt Tubes for RCNN\TCN           #
        ###################################################
        if self.training:

            # # get gt tubes and feats
            ##  calculate overlaps

            boxes_ = boxes.permute(1, 0, 2).contiguous()
            boxes_ = boxes_[:, :, :4].contiguous().view(num_actions, -1)

            if final_tubes.nelement() == 0:

                print('problem final_tubes ...')
                print('boxes :', boxes.cpu().numpy())
                print('boxes_ :', boxes_)
                print('boxes_.shape :', boxes_.shape)
                print('final_tubes :', final_tubes)
                print('self.calc.thresh:', self.calc.thresh)
                print('final_scores :', final_scores.shape)
                print('final_pos.shape :', final_poss.shape)

            if final_tubes.nelement() > 0:
                overlaps = tube_overlaps(final_tubes.view(-1, num_frames * 4),
                                         boxes_.type_as(final_tubes))
                max_overlaps, _ = torch.max(overlaps, 1)
                max_overlaps = max_overlaps.clamp_(min=0)

                ## TODO change numbers
                bg_tubes_indices = max_overlaps.lt(0.3).nonzero()
                if bg_tubes_indices.nelement() > 0:
                    bg_tubes_indices_picked = (
                        torch.rand(9) * bg_tubes_indices.size(0)).long()
                    bg_tubes_list = [
                        f_tubes[i]
                        for i in bg_tubes_indices[bg_tubes_indices_picked]
                    ]
                    bg_labels = torch.zeros(len(bg_tubes_list))
                else:
                    bg_tubes_list = []
                    bg_labels = torch.Tensor([])
            else:
                bg_tubes_list = []
                bg_labels = torch.Tensor([])

            gt_tubes_list = [[] for i in range(num_actions)]

            # print('n_clips :',n_clips)

            for i in range(n_clips):
                # print('i :',i)
                # print('p_tubes.shape :',p_tubes.shape)
                # print('f_gt_tubes.shape :',f_gt_tubes.shape)
                # print('p_tubes.shape :',p_tubes[i])
                # print('f_gt_tubes.shape :',f_gt_tubes[i])

                overlaps = tube_overlaps(p_tubes[i],
                                         f_gt_tubes[i].type_as(p_tubes))
                # print('overlaps :',overlaps)
                max_overlaps, argmax_overlaps = torch.max(overlaps, 0)

                for j in range(num_actions):
                    if max_overlaps[j] == 1.0:
                        gt_tubes_list[j].append((i, j))
            gt_tubes_list = [i for i in gt_tubes_list if i != []]
            if len(gt_tubes_list) != num_actions:
                print('len(gt_tubes_list :', len(gt_tubes_list))
                print('num_actions :', num_actions)
                print('boxes.cpu().numpy() :', boxes.cpu().numpy())

            # print('gt_tubes_list :',gt_tubes_list)
            ## concate fb, bg tubes
            if len(gt_tubes_list) == 0:  # no gt tube matched any proposal
                print('overlaps :', overlaps)
                print('max_overlaps :', max_overlaps)
                print('p_tubes :', p_tubes)
                print('f_gt_tubes :', f_gt_tubes)
                exit(-1)
            if bg_tubes_list != []:
                f_tubes = gt_tubes_list + bg_tubes_list
                target_lbl = torch.cat([labels, bg_labels], dim=0)
            else:
                f_tubes = gt_tubes_list
                target_lbl = labels

        # print('num_frames :',num_frames)
        # print('gt_tubes_list :',gt_tubes_list, ' labels :',labels)
        # print('f_tubes :',f_tubes, ' target_lbl :',target_lbl)
        ##############################################

        if len(f_tubes) == 0:
            print('------------------')
            print('    empty tube    ')
            print(' vid_id :', vid_id)
            print('self.calc.thresh :', self.calc.thresh)
            return torch.Tensor([]).cuda(), torch.Tensor([]).cuda(), None
        max_seq = reduce(lambda x, y: y if len(y) > len(x) else x, f_tubes)
        max_length = len(max_seq)

        ## calculate input rois
        ## f_feats.shape : [#f_tubes, max_length, 512]
        # final_video_tubes = torch.zeros(len(f_tubes),6).cuda()
        prob_out = torch.zeros(len(f_tubes), self.n_classes).cuda()
        # final_feats = []
        f_feats = torch.zeros(len(f_tubes), n_clips, 64,
                              self.sample_duration).type_as(features) - 1
        f_feats_len = torch.zeros(len(f_tubes)).type_as(features) - 1

        for i in range(len(f_tubes)):

            seq = f_tubes[i]

            # tmp_tube = torch.Tensor(len(seq),6)

            # feats = torch.Tensor(len(seq),self.p_feat_size)
            feats = torch.Tensor(len(seq), self.p_feat_size,
                                 self.sample_duration)

            for j in range(len(seq)):

                # feats[j] = features[seq[j][0],seq[j][1]].mean(1)
                feats[j] = features[seq[j][0], seq[j][1]]
                # tmp_tube[j] = p_tubes[seq[j]][1:7]

            f_feats_len[i] = len(seq)
            f_feats[i, :len(seq)] = feats
            prob_out[i] = self.act_rnn(
                feats.mean(0).view(1, -1).contiguous().cuda())

            # # feats = torch.mean(feats, dim=0)
            # if mode == 'extract':
            #     final_feats.append(feats)

            # try:
            #     prob_out[i] = self.act_rnn(feats.view(-1).cuda())
            # except Exception as e:
            #     print('feats.shape :',feats.shape)
            #     print('seq :',seq)
            #     for i in range(len(f_tubes)):
            #         print('seq[i] :',f_tubes[i])

            #     print('e :',e)
            #     exit(-1)
            # if prob_out[i,0] != prob_out[i,0]:
            #     print(' prob_out :', prob_out ,' feats :',feats.cpu().numpy(), ' numpy(), feats.shape  :,', feats.shape ,' target_lbl :',target_lbl, \
            #           ' \ntmp_tube :',tmp_tube, )
            #     exit(-1)

        if mode == 'extract':
            # now we use mean so we can have a tensor containing all features
            # final_tubes = final_tubes.cuda()
            target_lbl = target_lbl.cuda()
            # max_length = torch.Tensor([max_length]).cuda()
            return f_feats, target_lbl, f_feats_len
        # ##########################################
        # #           Time for Linear Loss         #
        # ##########################################

        cls_loss = torch.Tensor([0]).cuda()

        final_tubes = final_tubes.type_as(final_poss)
        # # classification probability
        if self.training:
            cls_loss = F.cross_entropy(prob_out.cpu(),
                                       target_lbl.long()).cuda()

        if self.training:
            return None, prob_out, cls_loss
        else:

            # init padding tubes because of multi-GPU system
            if final_tubes.size(0) > conf.UPDATE_THRESH:
                _, indices = torch.sort(final_scores)
                final_tubes = final_tubes[
                    indices[:conf.UPDATE_THRESH]].contiguous()
                prob_out = prob_out[indices[:conf.UPDATE_THRESH]].contiguous()

            max_prob_out, _ = torch.max(prob_out, 1)

            f_tubes = torch.cat([
                final_tubes.view(-1, num_frames * 4),
                max_prob_out.view(-1, 1).type_as(final_tubes)
            ], dim=1)

            keep = torch.Tensor(py_cpu_nms_tubes(f_tubes.float(), 0.5)).long()
            final_tubes = final_tubes[keep]
            prob_out = prob_out[keep]

            ret_tubes = torch.zeros(1, conf.UPDATE_THRESH, ret_n_frames,
                                    4).type_as(final_tubes).float() - 1
            ret_prob_out = torch.zeros(
                1, conf.UPDATE_THRESH,
                self.n_classes).type_as(final_tubes).float() - 1
            ret_tubes[0, :final_tubes.size(0), :num_frames] = final_tubes
            ret_prob_out[0, :final_tubes.size(0)] = prob_out
            return ret_tubes, ret_prob_out, torch.Tensor(
                [final_tubes.size(0)]).cuda()

            # return final_tubes, prob_out, None

    def deactivate_action_net_grad(self):

        for p in self.act_net.parameters():
            p.requires_grad = False
        # self.act_net.eval()
        # for key, value in dict(self.named_parameters()).items():
        #     print(key, value.requires_grad)

    def load_part_model(self,
                        resnet_path=None,
                        action_model_path=None,
                        rnn_path=None):

        # load action net
        if action_model_path is not None:

            act_data = torch.load(action_model_path)
            # act_data = torch.load('./action_net_model.pwf')

            ## to remove module
            new_state_dict = OrderedDict()
            for k, v in act_data.items():
                # if k.find('module') != -1 :
                name = k[7:]  # remove `module.`
                new_state_dict[name] = v

            act_net = ACT_net(self.classes, self.sample_duration)
            if resnet_path is not None:
                act_net.create_architecture(model_path=resnet_path)
            else:
                act_net.create_architecture()
            act_net.load_state_dict(new_state_dict)
            self.act_net = act_net

        else:
            self.act_net = ACT_net(self.classes, self.sample_duration)
            if resnet_path is not None:
                self.act_net.create_architecture(model_path=resnet_path)
            else:
                self.act_net.create_architecture()

        # load lstm
        if rnn_path is not None:

            # act_rnn = Act_RNN(self.p_feat_size,int(self.p_feat_size/2),self.n_classes)
            # act_rnn_data = torch.load(rnn_path)
            # act_rnn.load_state_dict(act_rnn_data)

            # earlier variant (kept for reference):
            #   Linear(64*sample_duration, 256) -> ReLU -> Dropout(0.8)
            #   -> Linear(256, n_classes)
            act_rnn = nn.Sequential(
                nn.Linear(64 * self.sample_duration, self.n_classes))
            act_rnn_data = torch.load(rnn_path)
            act_rnn.load_state_dict(act_rnn_data)
            self.act_rnn = act_rnn

        else:
            # self.act_rnn =Act_RNN(self.p_feat_size,int(self.p_feat_size/2),self.n_classes)
            # same single-layer classifier as above
            self.act_rnn = nn.Sequential(
                nn.Linear(64 * self.sample_duration, self.n_classes))
            for m in self.act_rnn.modules():
                if isinstance(m, nn.Linear):
                    # truncated-normal-like init (rough approximation);
                    # `stddev` and `mean` must be defined in the enclosing
                    # scope (not shown in this snippet)
                    m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)
from lib.models.action_net import ACT_net

actions = [
    '__background__', 'Basketball', 'BasketballDunk', 'Biking', 'CliffDiving',
    'CricketBowling', 'Diving', 'Fencing', 'FloorGymnastics', 'GolfSwing',
    'HorseRiding', 'IceDancing', 'LongJump', 'PoleVault', 'RopeClimbing',
    'SalsaSpin', 'SkateBoarding', 'Skiing', 'Skijet', 'SoccerJuggling',
    'Surfing', 'TennisSwing', 'TrampolineJumping', 'VolleyballSpiking',
    'WalkingWithDog'
]

cls2idx = {actions[i]: i for i in range(0, len(actions))}
act_model = ACT_net(actions, 16)
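
# A hedged sketch of finishing the setup above for inference; each call
# mirrors the create_architecture / DataParallel pattern used in the other
# snippets on this page:
#
#   act_model.create_architecture()
#   act_model = nn.DataParallel(act_model)
#   act_model.to(device)
#   act_model.eval()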
    print(h, w)
    print('path :', path)
    print('clips.shape :', clips.shape)
    clips = clips.unsqueeze(0)
    gt_tubes = gt_tubes.unsqueeze(0)
    print('gt_tubes.shape :', gt_tubes.shape)
    clips = clips.to(device)
    gt_tubes_r = resize_tube(gt_tubes, torch.Tensor([h]), torch.Tensor([w]),
                             sample_size).to(device)
    im_info = torch.Tensor([[sample_size, sample_size, sample_duration]] * gt_tubes_r.size(1)).to(device)

    n_classes = len(classes)
    resnet_shortcut = 'A'

    # Init action_net
    model = ACT_net(classes)
    model.create_architecture()
    data = model.act_rpn.RPN_cls_score.weight.data.clone()

    model_data = torch.load('../temporal_localization/jmdb_model.pwf')
    # model_data = torch.load('../temporal_localization/jmdb_model_020.pwf')
    # # model_data = torch.load('../temporal_localization/r')

    model.load_state_dict(model_data)

    model = nn.DataParallel(model)
    model.to(device)

    model.eval()
    print('im_info :',im_info)
    print('-----Starts-----')
Example #13
                 classes_idx=cls2idx)
    data_loader = torch.utils.data.DataLoader(data,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              num_workers=n_threads,
                                              pin_memory=True)

    n_classes = len(classes)
    resnet_shortcut = 'A'

    lr = 0.001
    lr_decay_step = 5
    lr_decay_gamma = 0.1

    # Init action_net
    model = ACT_net(classes)
    model.create_architecture()
    if torch.cuda.device_count() > 1:
        print('Using {} GPUs!'.format(torch.cuda.device_count()))

        model = nn.DataParallel(model)

    model.to(device)

    params = []
    for key, value in dict(model.named_parameters()).items():
        # print(key, value.requires_grad)
        if value.requires_grad:
            print('key :', key)
            if 'bias' in key:
                params += [{'params':[value],'lr':lr*(True + 1), \