Example #1
    def _get_train_label(self, index, anchor_xmin, anchor_xmax):
        video_name = self.video_list[index]
        video_info = self.video_dict[video_name]
        video_frame = video_info['duration_frame']
        video_second = video_info['duration_second']
        feature_frame = video_info['feature_frame']
        corrected_second = float(feature_frame) / video_frame * video_second  # some trailing frames are not covered by the features
        video_labels = video_info['annotations']  # annotation timestamps are in seconds, not frames

        ##############################################################################################
        # convert the annotations from seconds to fractions of the video duration
        gt_bbox = []
        for j in range(len(video_labels)):
            tmp_info = video_labels[j]
            tmp_start = max(min(1, tmp_info['segment'][0] / corrected_second), 0)
            tmp_end = max(min(1, tmp_info['segment'][1] / corrected_second), 0)
            gt_bbox.append([tmp_start, tmp_end])

        ####################################################################################################
        # generate R_s and R_e
        gt_bbox = np.array(gt_bbox)
        gt_xmins = gt_bbox[:, 0]
        gt_xmaxs = gt_bbox[:, 1]
        gt_lens = gt_xmaxs - gt_xmins
        gt_len_small = 3 * self.temporal_gap  # np.maximum(self.temporal_gap, self.boundary_ratio * gt_lens)
        gt_start_bboxs = np.stack((gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
        gt_end_bboxs = np.stack((gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)
        #####################################################################################################

        gt_iou_map = np.zeros([self.temporal_scale, self.temporal_scale])
        for i in range(self.temporal_scale):
            for j in range(i, self.temporal_scale):
                gt_iou_map[i, j] = np.max(
                    iou_with_anchors(i * self.temporal_gap, (j + 1) * self.temporal_gap, gt_xmins, gt_xmaxs))
        gt_iou_map = torch.Tensor(gt_iou_map)

        ##########################################################################################################
        # compute the IoA of every temporal location with the gt start/end regions
        match_score_start = []
        for jdx in range(len(anchor_xmin)):
            match_score_start.append(np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx], gt_start_bboxs[:, 0], gt_start_bboxs[:, 1])))
        match_score_end = []
        for jdx in range(len(anchor_xmin)):
            match_score_end.append(np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx], gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
        match_score_start = torch.Tensor(match_score_start)
        match_score_end = torch.Tensor(match_score_end)
        ############################################################################################################

        return match_score_start, match_score_end, gt_iou_map
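
All of these snippets lean on two small overlap helpers whose definitions live elsewhere in each repository. The following is a minimal NumPy sketch consistent with how they are called above (IoU normalizes by the union, IoA by the first interval's length), not necessarily the verbatim originals:

import numpy as np

def iou_with_anchors(anchors_min, anchors_max, box_min, box_max):
    # Jaccard overlap between anchors and boxes; broadcasts over arrays
    len_anchors = anchors_max - anchors_min
    int_xmin = np.maximum(anchors_min, box_min)
    int_xmax = np.minimum(anchors_max, box_max)
    inter_len = np.maximum(int_xmax - int_xmin, 0.)
    union_len = len_anchors - inter_len + box_max - box_min
    return np.divide(inter_len, union_len)

def ioa_with_anchors(anchors_min, anchors_max, box_min, box_max):
    # intersection length normalized by the first interval's length
    len_anchors = anchors_max - anchors_min
    int_xmin = np.maximum(anchors_min, box_min)
    int_xmax = np.minimum(anchors_max, box_max)
    inter_len = np.maximum(int_xmax - int_xmin, 0.)
    return np.divide(inter_len, len_anchors)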
Example #2
    def _preparedata(self):
        print('preparing data ...')
        for videoName in tqdm(self.videoNameList):
            video_annoDf = self.anno_df[self.anno_df.video == videoName]
            video_annoDf = video_annoDf[video_annoDf.type_idx != 0]  # 0 for Ambiguous

            gt_xmins = video_annoDf.startFrame.values[:]
            gt_xmaxs = video_annoDf.endFrame.values[:]
            gt_type_idx = video_annoDf.type_idx.values[:]

            rgb_feature, flow_feature = self._getVideoFeature(videoName, self.mode.lower())

            numSnippet = min(rgb_feature.shape[0], flow_feature.shape[0])
            frameList = [1 + self.unit_size * i for i in range(numSnippet)]
            df_data = np.concatenate((rgb_feature, flow_feature), axis=1)
            df_snippet = frameList
            window_size = self.window_size
            stride = self.window_step
            n_window = (numSnippet + stride - window_size) // stride
            windows_start = [i * stride for i in range(n_window)]
            if numSnippet < window_size:
                windows_start = [0]
                tmp_data = np.zeros((window_size - numSnippet, self.feature_dim))
                df_data = np.concatenate((df_data, tmp_data), axis=0)
                df_snippet.extend([df_snippet[-1] + self.unit_size * (i + 1) for i in range(window_size - numSnippet)])
            elif numSnippet - windows_start[-1] - window_size > 30:
                windows_start.append(numSnippet - window_size)

            snippet_xmin = df_snippet
            snippet_xmax = df_snippet[1:]
            snippet_xmax.append(df_snippet[-1] + self.unit_size)
            for start in windows_start:
                tmp_data = df_data[start:start + window_size, :]
                tmp_anchor_xmins = snippet_xmin[start:start + window_size]
                tmp_anchor_xmaxs = snippet_xmax[start:start + window_size]
                tmp_gt_bbox = []
                tmp_gt_class = []
                tmp_ioa_list = []
                for idx in range(len(gt_xmins)):
                    tmp_ioa = ioa_with_anchors(gt_xmins[idx], gt_xmaxs[idx], tmp_anchor_xmins[0], tmp_anchor_xmaxs[-1])
                    tmp_ioa_list.append(tmp_ioa)
                    if tmp_ioa > 0:
                        # gt bbox info
                        corrected_start = max(gt_xmins[idx], tmp_anchor_xmins[0]) - tmp_anchor_xmins[0]
                        corrected_end = min(gt_xmaxs[idx], tmp_anchor_xmaxs[-1]) - tmp_anchor_xmins[0]
                        tmp_gt_bbox.append([float(corrected_start) / (self.window_size * self.unit_size),
                                            float(corrected_end) / (self.window_size * self.unit_size)])
                        # gt class label
                        one_hot = [0] * self.num_classes
                        one_hot[self.class_real.index(gt_type_idx[idx])] = 1
                        tmp_gt_class.append(one_hot)
                if len(tmp_gt_bbox) > 0 and max(tmp_ioa_list) > self.ioa_ratio_threshold:
                    # the overlap region is corrected
                    tmp_results = [torch.transpose(torch.Tensor(tmp_data), 0, 1), np.array(tmp_gt_bbox),
                                   np.array(tmp_gt_class)]
                    self.sampels.append(tmp_results)
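
The window arithmetic above is easier to see in isolation. Here is a standalone sketch of the same start-index rule; window_starts is an illustrative helper (not part of the original class), with the 30-snippet tail threshold taken from the code:

def window_starts(num_snippet, window_size, stride, tail_thresh=30):
    # fewer snippets than one window: a single window, zero-padded by the caller
    if num_snippet < window_size:
        return [0]
    starts = [i * stride for i in range((num_snippet + stride - window_size) // stride)]
    # keep a meaningful tail by adding one last, right-aligned window
    if num_snippet - starts[-1] - window_size > tail_thresh:
        starts.append(num_snippet - window_size)
    return starts

# e.g. window_starts(160, 128, 64) -> [0, 32]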
Example #3
    def _get_train_label(self, index):
        video_id = self.video_ids[index]
        video_info = self.event_dict[video_id]
        video_labels = video_info['events']  # event timestamps are in seconds, not frames
        duration = video_info['duration']

        ##############################################################################################
        # convert the annotations from seconds to fractions of the video duration
        gt_bbox = []
        gt_iou_map = []
        for j in range(len(video_labels)):
            tmp_info = video_labels[j]
            tmp_start = max(min(1, tmp_info['segment'][0] / duration), 0)
            tmp_end = max(min(1, tmp_info['segment'][1] / duration), 0)
            gt_bbox.append([tmp_start, tmp_end])
            tmp_gt_iou_map = iou_with_anchors(self.match_map[:, 0],
                                              self.match_map[:, 1], tmp_start,
                                              tmp_end)
            tmp_gt_iou_map = np.reshape(tmp_gt_iou_map,
                                        [self.max_duration, self.temporal_dim])
            gt_iou_map.append(tmp_gt_iou_map)
        gt_iou_map = np.array(gt_iou_map)
        gt_iou_map = np.max(gt_iou_map, axis=0)
        gt_iou_map = torch.Tensor(gt_iou_map)
        ##############################################################################################

        ##############################################################################################
        # generate R_s and R_e
        gt_bbox = np.array(gt_bbox)
        gt_xmins = gt_bbox[:, 0]
        gt_xmaxs = gt_bbox[:, 1]
        # gt_lens = gt_xmaxs - gt_xmins
        gt_len_small = 3 * self.temporal_gap  # np.maximum(self.temporal_gap, self.boundary_ratio * gt_lens)
        gt_start_bboxs = np.stack(
            (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
        gt_end_bboxs = np.stack(
            (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)
        ##############################################################################################

        ##############################################################################################
        # compute the IoA of every temporal location with the gt start/end regions
        match_score_start = []
        for jdx in range(len(self.anchor_xmin)):
            match_score_start.append(
                np.max(
                    ioa_with_anchors(self.anchor_xmin[jdx],
                                     self.anchor_xmax[jdx],
                                     gt_start_bboxs[:, 0],
                                     gt_start_bboxs[:, 1])))
        match_score_end = []
        for jdx in range(len(self.anchor_xmin)):
            match_score_end.append(
                np.max(
                    ioa_with_anchors(self.anchor_xmin[jdx],
                                     self.anchor_xmax[jdx], gt_end_bboxs[:, 0],
                                     gt_end_bboxs[:, 1])))
        match_score_start = torch.tensor(match_score_start)
        match_score_end = torch.tensor(match_score_end)
        ##############################################################################################

        return match_score_start, match_score_end, gt_iou_map
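
Example #3 indexes a precomputed self.match_map and reshapes the IoU scores to [max_duration, temporal_dim]. As an assumption consistent with that reshape order, a BMN-style match map is typically built duration-major, like this:

import numpy as np

temporal_dim, max_duration = 100, 100
temporal_gap = 1.0 / temporal_dim

match_map = []
for idx in range(temporal_dim):              # start position
    tmp_match_window = []
    xmin = temporal_gap * idx
    for jdx in range(1, max_duration + 1):   # proposal duration
        xmax = xmin + temporal_gap * jdx
        tmp_match_window.append([xmin, xmax])
    match_map.append(tmp_match_window)
match_map = np.array(match_map)                 # (start, duration, 2)
match_map = np.transpose(match_map, [1, 0, 2])  # (duration, start, 2)
match_map = np.reshape(match_map, [-1, 2])      # row-major: duration varies slowest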
Example #4
    def _get_label(self, duration, timestamps, video_length):
        gt_bbox = []
        coefficient = 1 / float(duration)
        for timestamp in timestamps:
            start = max(min(float(timestamp[0]) * coefficient, 1), 0)
            end = min(max(float(timestamp[1]) * coefficient, 0), 1)
            gt_bbox.append([start, end])
        gt_bbox = np.array(gt_bbox)
        gt_xmins = gt_bbox[:, 0]
        gt_xmaxs = gt_bbox[:, 1]
        gt_lens = gt_xmaxs - gt_xmins
        gt_len_small = 3. / self.temporal_scale
        gt_start_bboxs = np.stack(
            (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
        gt_end_bboxs = np.stack(
            (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)

        confidence_map = np.zeros((self.temporal_scale, self.temporal_scale))
        for i in range(self.temporal_scale):
            for j in range(i, self.temporal_scale):
                confidence_map[i, j] = np.max(
                    iou_with_anchors(i / self.temporal_scale,
                                     (j + 1) / self.temporal_scale, gt_xmins,
                                     gt_xmaxs))
        confidence_map = torch.tensor(confidence_map)

        start_label_map = []
        end_label_map = []
        for xmin, xmax in zip(self.anchor_xmin, self.anchor_xmax):
            start_label_map.append(
                np.max(
                    ioa_with_anchors(xmin, xmax, gt_start_bboxs[:, 0],
                                     gt_start_bboxs[:, 1])))
            end_label_map.append(
                np.max(
                    ioa_with_anchors(xmin, xmax, gt_end_bboxs[:, 0],
                                     gt_end_bboxs[:, 1])))
        start_label_map = torch.tensor(start_label_map)
        end_label_map = torch.tensor(end_label_map)

        return start_label_map, end_label_map, confidence_map
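
Examples #1 and #5 receive anchor_xmin / anchor_xmax as arguments, while #3 and #4 read them from self. A plausible construction, assuming BMN-style half-cell offsets around each temporal location (an assumption; BSN-style code uses [i * gap, (i + 1) * gap] instead):

temporal_scale = 100
temporal_gap = 1.0 / temporal_scale
anchor_xmin = [temporal_gap * (i - 0.5) for i in range(temporal_scale)]
anchor_xmax = [temporal_gap * (i + 0.5) for i in range(temporal_scale)]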
Example #5
    def _get_train_label(self, index, anchor_xmin, anchor_xmax):
        video_name = self.video_list[index]
        video_info = self.video_dict[video_name]
        video_frame = video_info['duration_frame']  #1128
        video_second = video_info['duration_second']  #47.114
        feature_frame = video_info['feature_frame']  # 1120
        corrected_second = float(feature_frame) / video_frame * video_second  # some trailing frames are unused; 46.77 here
        video_labels = video_info['annotations']  # in seconds, e.g. [{'segment': [0.01, 37.11], 'labels': 'waxing skis'}]

        ##############################################################################################
        # change the measurement from second to percentage:
        # start/end times become fractions of the (corrected) video duration
        gt_bbox = []
        gt_iou_map = []
        for j in range(len(video_labels)):
            tmp_info = video_labels[j]
            tmp_start = max(min(1, tmp_info['segment'][0] / corrected_second),
                            0)  # 0.00
            tmp_end = max(min(1, tmp_info['segment'][1] / corrected_second),
                          0)  # 0.79
            gt_bbox.append([tmp_start, tmp_end])
            # IoU of the current gt box with every predefined anchor
            tmp_gt_iou_map = iou_with_anchors(self.match_map[:, 0],
                                              self.match_map[:, 1], tmp_start,
                                              tmp_end)  # (10000,)
            # after the reshape, rows index proposal durations and columns
            # index start positions (cf. the [max_duration, temporal_dim]
            # reshape in Example #3), e.g. for the column vector:
            #     [[1]
            #      [2]      ------>  [[1,2],
            #      [3]                [3,4]]
            #      [4]]
            tmp_gt_iou_map = np.reshape(
                tmp_gt_iou_map,
                [self.temporal_scale, self.temporal_scale])  # (100,100)
            gt_iou_map.append(tmp_gt_iou_map)

        # effectively a lookup table of the IoU between every candidate
        # proposal and the ground-truth boxes
        gt_iou_map = np.array(gt_iou_map)  # (1, 100, 100); the leading 1 is the number of gt boxes
        gt_iou_map = np.max(gt_iou_map,
                            axis=0)  # with several gt boxes, keep the max IoU per cell
        gt_iou_map = torch.Tensor(gt_iou_map)
        ##############################################################################################

        # widen each gt start/end instant into a small interval
        ####################################################################################################
        # generate R_s and R_e
        gt_bbox = np.array(gt_bbox)
        gt_xmins = gt_bbox[:, 0]
        gt_xmaxs = gt_bbox[:, 1]
        gt_lens = gt_xmaxs - gt_xmins
        gt_len_small = 3 * self.temporal_gap  # np.maximum(self.temporal_gap, self.boundary_ratio * gt_lens)
        # the interval width is 0.03 here
        gt_start_bboxs = np.stack(
            (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2),
            axis=1)  # e.g. [0.12, 0.15]
        gt_end_bboxs = np.stack(
            (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2),
            axis=1)  # e.g. [0.85, 0.88]
        #####################################################################################################

        # (original author's note) not obvious why the labels can be constructed this way
        ##########################################################################################################
        # compute the IoA of each small anchor interval with the gt start/end regions
        match_score_start = []  # (100)
        for jdx in range(len(anchor_xmin)):
            match_score_start.append(
                np.max(
                    ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                                     gt_start_bboxs[:, 0],
                                     gt_start_bboxs[:, 1])))
        match_score_end = []  # (100)

        for jdx in range(len(anchor_xmin)):
            match_score_end.append(
                np.max(
                    ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                                     gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
        match_score_start = torch.Tensor(match_score_start)
        match_score_end = torch.Tensor(match_score_end)
        ############################################################################################################

        return match_score_start, match_score_end, gt_iou_map
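
Plugging the inline annotations from Example #5 into the normalization step shows where the 0.00 / 0.79 figures come from (pure arithmetic, no repository code):

feature_frame, video_frame, video_second = 1120, 1128, 47.114
corrected_second = feature_frame / video_frame * video_second   # ~46.78
tmp_start = max(min(1, 0.01 / corrected_second), 0)             # ~0.0002 -> 0.00
tmp_end = max(min(1, 37.11 / corrected_second), 0)              # ~0.793  -> 0.79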