def _get_train_label(self, index, anchor_xmin, anchor_xmax):
    video_name = self.video_list[index]
    video_info = self.video_dict[video_name]
    video_frame = video_info['duration_frame']
    video_second = video_info['duration_second']
    feature_frame = video_info['feature_frame']
    # some trailing frames are not used by feature extraction, so rescale the duration
    corrected_second = float(feature_frame) / video_frame * video_second
    video_labels = video_info['annotations']  # the measurement is second, not frame

    ##############################################################################################
    # change the measurement from second to percentage
    gt_bbox = []
    for j in range(len(video_labels)):
        tmp_info = video_labels[j]
        tmp_start = max(min(1, tmp_info['segment'][0] / corrected_second), 0)
        tmp_end = max(min(1, tmp_info['segment'][1] / corrected_second), 0)
        gt_bbox.append([tmp_start, tmp_end])

    ####################################################################################################
    # generate R_s and R_e
    gt_bbox = np.array(gt_bbox)
    gt_xmins = gt_bbox[:, 0]
    gt_xmaxs = gt_bbox[:, 1]
    gt_lens = gt_xmaxs - gt_xmins
    gt_len_small = 3 * self.temporal_gap  # np.maximum(self.temporal_gap, self.boundary_ratio * gt_lens)
    gt_start_bboxs = np.stack(
        (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
    gt_end_bboxs = np.stack(
        (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)

    #####################################################################################################
    # IoU of every candidate (start, end) cell against the ground-truth segments
    gt_iou_map = np.zeros([self.temporal_scale, self.temporal_scale])
    for i in range(self.temporal_scale):
        for j in range(i, self.temporal_scale):
            gt_iou_map[i, j] = np.max(
                iou_with_anchors(i * self.temporal_gap, (j + 1) * self.temporal_gap,
                                 gt_xmins, gt_xmaxs))
    gt_iou_map = torch.Tensor(gt_iou_map)

    ##########################################################################################################
    # calculate the ioa for all timestamps
    match_score_start = []
    for jdx in range(len(anchor_xmin)):
        match_score_start.append(
            np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                                 gt_start_bboxs[:, 0], gt_start_bboxs[:, 1])))
    match_score_end = []
    for jdx in range(len(anchor_xmin)):
        match_score_end.append(
            np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                                 gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
    match_score_start = torch.Tensor(match_score_start)
    match_score_end = torch.Tensor(match_score_end)

    ############################################################################################################
    return match_score_start, match_score_end, gt_iou_map
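# The functions above rely on iou_with_anchors and ioa_with_anchors, which are not
# defined in this section. A minimal sketch consistent with how they are called
# (1-D intervals; numpy broadcasting lets either side be a scalar or an array),
# matching the usual BSN/BMN utility functions:
import numpy as np

def ioa_with_anchors(anchors_min, anchors_max, box_min, box_max):
    # intersection length divided by the anchor's own length
    len_anchors = anchors_max - anchors_min
    int_xmin = np.maximum(anchors_min, box_min)
    int_xmax = np.minimum(anchors_max, box_max)
    inter_len = np.maximum(int_xmax - int_xmin, 0.)
    return np.divide(inter_len, len_anchors)

def iou_with_anchors(anchors_min, anchors_max, box_min, box_max):
    # standard intersection over union on 1-D intervals
    len_anchors = anchors_max - anchors_min
    int_xmin = np.maximum(anchors_min, box_min)
    int_xmax = np.minimum(anchors_max, box_max)
    inter_len = np.maximum(int_xmax - int_xmin, 0.)
    union_len = len_anchors - inter_len + box_max - box_min
    return np.divide(inter_len, union_len)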
def _preparedata(self):
    print('wait...prepare data')
    for videoName in tqdm(self.videoNameList):
        video_annoDf = self.anno_df[self.anno_df.video == videoName]
        video_annoDf = video_annoDf[video_annoDf.type_idx != 0]  # 0 for Ambiguous
        gt_xmins = video_annoDf.startFrame.values[:]
        gt_xmaxs = video_annoDf.endFrame.values[:]
        gt_type_idx = video_annoDf.type_idx.values[:]

        rgb_feature, flow_feature = self._getVideoFeature(videoName, self.mode.lower())
        numSnippet = min(rgb_feature.shape[0], flow_feature.shape[0])
        frameList = [1 + self.unit_size * i for i in range(numSnippet)]
        df_data = np.concatenate((rgb_feature, flow_feature), axis=1)
        df_snippet = frameList

        # slide windows over the snippet sequence
        window_size = self.window_size
        stride = self.window_step
        n_window = (numSnippet + stride - window_size) // stride
        windows_start = [i * stride for i in range(n_window)]
        if numSnippet < window_size:
            # pad short videos with zero features up to one full window
            windows_start = [0]
            tmp_data = np.zeros((window_size - numSnippet, self.feature_dim))
            df_data = np.concatenate((df_data, tmp_data), axis=0)
            df_snippet.extend([df_snippet[-1] + self.unit_size * (i + 1)
                               for i in range(window_size - numSnippet)])
        elif numSnippet - windows_start[-1] - window_size > 30:
            # add one last window so the tail of the video is covered
            windows_start.append(numSnippet - window_size)

        snippet_xmin = df_snippet
        snippet_xmax = df_snippet[1:]
        snippet_xmax.append(df_snippet[-1] + self.unit_size)

        for start in windows_start:
            tmp_data = df_data[start:start + window_size, :]
            tmp_anchor_xmins = snippet_xmin[start:start + window_size]
            tmp_anchor_xmaxs = snippet_xmax[start:start + window_size]
            tmp_gt_bbox = []
            tmp_gt_class = []
            tmp_ioa_list = []
            for idx in range(len(gt_xmins)):
                tmp_ioa = ioa_with_anchors(gt_xmins[idx], gt_xmaxs[idx],
                                           tmp_anchor_xmins[0], tmp_anchor_xmaxs[-1])
                tmp_ioa_list.append(tmp_ioa)
                if tmp_ioa > 0:
                    # gt bbox info, clipped to the window and normalized to [0, 1]
                    corrected_start = max(gt_xmins[idx], tmp_anchor_xmins[0]) - tmp_anchor_xmins[0]
                    corrected_end = min(gt_xmaxs[idx], tmp_anchor_xmaxs[-1]) - tmp_anchor_xmins[0]
                    tmp_gt_bbox.append([float(corrected_start) / (self.window_size * self.unit_size),
                                        float(corrected_end) / (self.window_size * self.unit_size)])
                    # one-hot gt class label
                    one_hot = [0] * self.num_classes
                    one_hot[self.class_real.index(gt_type_idx[idx])] = 1
                    tmp_gt_class.append(one_hot)
            if len(tmp_gt_bbox) > 0 and max(tmp_ioa_list) > self.ioa_ratio_threshold:
                # keep the window only if its overlap with some ground truth is large enough
                tmp_results = [torch.transpose(torch.Tensor(tmp_data), 0, 1),
                               np.array(tmp_gt_bbox),
                               np.array(tmp_gt_class)]
                self.sampels.append(tmp_results)
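# A quick worked example of the windowing logic above, with hypothetical sizes
# (window_size and window_step come from the config and are not fixed here):
numSnippet, window_size, stride = 420, 128, 64
n_window = (numSnippet + stride - window_size) // stride      # 356 // 64 = 5
windows_start = [i * stride for i in range(n_window)]         # [0, 64, 128, 192, 256]
# the last regular window ends at snippet 384; 36 snippets remain uncovered,
# which exceeds the threshold of 30, so one tail window is appended
if numSnippet - windows_start[-1] - window_size > 30:
    windows_start.append(numSnippet - window_size)            # [..., 256, 292]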
def _get_train_label(self, index):
    video_id = self.video_ids[index]
    video_info = self.event_dict[video_id]
    video_labels = video_info['events']  # the measurement is second, not frame
    duration = video_info['duration']

    ##############################################################################################
    # change the measurement from second to percentage
    gt_bbox = []
    gt_iou_map = []
    for j in range(len(video_labels)):
        tmp_info = video_labels[j]
        tmp_start = max(min(1, tmp_info['segment'][0] / duration), 0)
        tmp_end = max(min(1, tmp_info['segment'][1] / duration), 0)
        gt_bbox.append([tmp_start, tmp_end])
        tmp_gt_iou_map = iou_with_anchors(self.match_map[:, 0],
                                          self.match_map[:, 1],
                                          tmp_start, tmp_end)
        tmp_gt_iou_map = np.reshape(tmp_gt_iou_map,
                                    [self.max_duration, self.temporal_dim])
        gt_iou_map.append(tmp_gt_iou_map)
    gt_iou_map = np.array(gt_iou_map)
    # with several ground-truth events, keep the maximum IoU at each location
    gt_iou_map = np.max(gt_iou_map, axis=0)
    gt_iou_map = torch.Tensor(gt_iou_map)
    ##############################################################################################

    ##############################################################################################
    # generate R_s and R_e
    gt_bbox = np.array(gt_bbox)
    gt_xmins = gt_bbox[:, 0]
    gt_xmaxs = gt_bbox[:, 1]
    # gt_lens = gt_xmaxs - gt_xmins
    gt_len_small = 3 * self.temporal_gap  # np.maximum(self.temporal_gap, self.boundary_ratio * gt_lens)
    gt_start_bboxs = np.stack(
        (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
    gt_end_bboxs = np.stack(
        (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)
    ##############################################################################################

    ##############################################################################################
    # calculate the ioa for all timestamps
    match_score_start = []
    for jdx in range(len(self.anchor_xmin)):
        match_score_start.append(
            np.max(
                ioa_with_anchors(self.anchor_xmin[jdx], self.anchor_xmax[jdx],
                                 gt_start_bboxs[:, 0], gt_start_bboxs[:, 1])))
    match_score_end = []
    for jdx in range(len(self.anchor_xmin)):
        match_score_end.append(
            np.max(
                ioa_with_anchors(self.anchor_xmin[jdx], self.anchor_xmax[jdx],
                                 gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
    match_score_start = torch.tensor(match_score_start)
    match_score_end = torch.tensor(match_score_end)
    ##############################################################################################
    return match_score_start, match_score_end, gt_iou_map
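# The reshape to [max_duration, temporal_dim] above only works if self.match_map
# enumerates candidate proposals duration-major. The constructor is not shown in
# this section; a sketch of one way __init__ could build match_map and the anchor
# lists so that the code above is consistent (all names and sizes are assumptions):
import numpy as np

temporal_dim, max_duration = 100, 100
temporal_gap = 1.0 / temporal_dim

match_map = []
for idx in range(temporal_dim):                 # candidate start index
    tmp_match_window = []
    xmin = temporal_gap * idx
    for jdx in range(1, max_duration + 1):      # candidate duration in snippets
        xmax = xmin + temporal_gap * jdx
        tmp_match_window.append([xmin, xmax])
    match_map.append(tmp_match_window)
match_map = np.array(match_map)                 # (temporal_dim, max_duration, 2)
match_map = np.transpose(match_map, [1, 0, 2])  # (max_duration, temporal_dim, 2)
match_map = np.reshape(match_map, [-1, 2])      # rows ordered duration-major

# one common choice: each anchor interval is one temporal_gap wide,
# centered on its snippet
anchor_xmin = [temporal_gap * (i - 0.5) for i in range(temporal_dim)]
anchor_xmax = [temporal_gap * (i + 0.5) for i in range(temporal_dim)]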
def _get_label(self, duration, timestamps, video_length):
    # change the measurement from seconds to a fraction of the duration
    gt_bbox = []
    coefficient = 1 / float(duration)
    for timestamp in timestamps:
        start = max(min(float(timestamp[0]) * coefficient, 1), 0)
        end = min(max(float(timestamp[1]) * coefficient, 0), 1)
        gt_bbox.append([start, end])
    gt_bbox = np.array(gt_bbox)
    gt_xmins = gt_bbox[:, 0]
    gt_xmaxs = gt_bbox[:, 1]
    # expand each ground-truth boundary into a small region
    gt_len_small = 3. / self.temporal_scale
    gt_start_bboxs = np.stack(
        (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
    gt_end_bboxs = np.stack(
        (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)

    # IoU of every candidate (start, end) cell against the ground-truth segments
    confidence_map = np.zeros((self.temporal_scale, self.temporal_scale))
    for i in range(self.temporal_scale):
        for j in range(i, self.temporal_scale):
            confidence_map[i, j] = np.max(
                iou_with_anchors(i / self.temporal_scale,
                                 (j + 1) / self.temporal_scale,
                                 gt_xmins, gt_xmaxs))
    confidence_map = torch.tensor(confidence_map)

    # IoA of each anchor interval with the expanded start/end regions
    start_label_map = []
    end_label_map = []
    for xmin, xmax in zip(self.anchor_xmin, self.anchor_xmax):
        start_label_map.append(
            np.max(
                ioa_with_anchors(xmin, xmax,
                                 gt_start_bboxs[:, 0], gt_start_bboxs[:, 1])))
        end_label_map.append(
            np.max(
                ioa_with_anchors(xmin, xmax,
                                 gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
    start_label_map = torch.tensor(start_label_map)
    end_label_map = torch.tensor(end_label_map)
    return start_label_map, end_label_map, confidence_map
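# The nested loop above makes O(T^2) Python-level calls. A broadcast sketch that
# computes the same upper-triangular map in one shot, assuming every ground-truth
# segment has positive length (the function name is hypothetical):
import numpy as np

def confidence_map_vectorized(T, gt_xmins, gt_xmaxs):
    starts = np.arange(T)[:, None, None] / T           # candidate start i/T
    ends = (np.arange(T)[None, :, None] + 1) / T       # candidate end (j+1)/T
    # intersection and union against each gt segment, broadcast to (T, T, N)
    inter = np.clip(np.minimum(ends, gt_xmaxs) - np.maximum(starts, gt_xmins), 0., None)
    union = np.clip(ends - starts, 0., None) + (gt_xmaxs - gt_xmins) - inter
    iou = inter / union
    # max over gt segments; cells with j < i stay zero, as in the loop
    return np.triu(iou.max(axis=-1))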
def _get_train_label(self, index, anchor_xmin, anchor_xmax):
    video_name = self.video_list[index]
    video_info = self.video_dict[video_name]
    video_frame = video_info['duration_frame']    # e.g. 1128
    video_second = video_info['duration_second']  # e.g. 47.114
    feature_frame = video_info['feature_frame']   # e.g. 1120
    # some trailing frames are not used, so rescale the duration, e.g. 46.77
    corrected_second = float(feature_frame) / video_frame * video_second
    # the measurement is second, not frame,
    # e.g. [{'segment': [0.01, 37.11], 'labels': 'waxing skis'}]
    video_labels = video_info['annotations']

    ##############################################################################################
    # change the measurement from second to percentage:
    # start/end times become fractions of the video duration
    gt_bbox = []
    gt_iou_map = []
    for j in range(len(video_labels)):
        tmp_info = video_labels[j]
        tmp_start = max(min(1, tmp_info['segment'][0] / corrected_second), 0)  # e.g. 0.00
        tmp_end = max(min(1, tmp_info['segment'][1] / corrected_second), 0)    # e.g. 0.79
        gt_bbox.append([tmp_start, tmp_end])
        # IoU between the current gt_bbox and all predefined anchors, shape (10000,)
        tmp_gt_iou_map = iou_with_anchors(self.match_map[:, 0],
                                          self.match_map[:, 1],
                                          tmp_start, tmp_end)
        # after the reshape, rows correspond to different start times and columns
        # to different durations, e.g. for the matrix:
        # [[1], [2], [3], [4]]  ->  [[1, 2],
        #                            [3, 4]]
        tmp_gt_iou_map = np.reshape(
            tmp_gt_iou_map, [self.temporal_scale, self.temporal_scale])  # (100, 100)
        gt_iou_map.append(tmp_gt_iou_map)
    # effectively a lookup table of the IoU between every candidate proposal and the gt_bboxes
    gt_iou_map = np.array(gt_iou_map)  # (1, 100, 100), where 1 is the number of gt_bboxes
    gt_iou_map = np.max(gt_iou_map, axis=0)  # with several gt_bboxes, keep the maximum IoU
    gt_iou_map = torch.Tensor(gt_iou_map)
    ##############################################################################################

    # expand each gt start/end time into a small region
    ####################################################################################################
    # generate R_s and R_e
    gt_bbox = np.array(gt_bbox)
    gt_xmins = gt_bbox[:, 0]
    gt_xmaxs = gt_bbox[:, 1]
    gt_lens = gt_xmaxs - gt_xmins
    gt_len_small = 3 * self.temporal_gap  # np.maximum(self.temporal_gap, self.boundary_ratio * gt_lens), i.e. 0.03
    gt_start_bboxs = np.stack(
        (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)  # e.g. [0.12, 0.15]
    gt_end_bboxs = np.stack(
        (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)  # e.g. [0.85, 0.88]
    #####################################################################################################
    # not obvious at first why the labels can be constructed this way

    ##########################################################################################################
    # calculate the ioa for all timestamps:
    # overlap of each small 0.02 interval with the gt_start and gt_end regions
    match_score_start = []  # (100,)
    for jdx in range(len(anchor_xmin)):
        match_score_start.append(
            np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                                 gt_start_bboxs[:, 0], gt_start_bboxs[:, 1])))
    match_score_end = []  # (100,)
    for jdx in range(len(anchor_xmin)):
        match_score_end.append(
            np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                                 gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
    match_score_start = torch.Tensor(match_score_start)
    match_score_end = torch.Tensor(match_score_end)
    ############################################################################################################
    return match_score_start, match_score_end, gt_iou_map
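# How _get_train_label is typically consumed by the surrounding Dataset; a sketch
# assuming a _load_file helper that returns the (feature_dim, temporal_scale)
# feature tensor for the video (the helper name is an assumption, not shown above):
def __getitem__(self, index):
    video_data = self._load_file(index)
    if self.mode == "train":
        match_score_start, match_score_end, gt_iou_map = self._get_train_label(
            index, self.anchor_xmin, self.anchor_xmax)
        return video_data, match_score_start, match_score_end, gt_iou_map
    return index, video_data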