def _get_train_label(self, index, anchor_xmin, anchor_xmax):
    video_name = self.video_list[index]
    video_info = self.video_dict[video_name]
    video_frame = video_info['duration_frame']
    video_second = video_info['duration_second']
    feature_frame = video_info['feature_frame']
    # there are some frames not covered by the extracted features
    corrected_second = float(feature_frame) / video_frame * video_second
    video_labels = video_info['annotations']  # the measurement is second, not frame

    ##########################################################################
    # change the measurement from second to percentage
    gt_bbox = []
    for j in range(len(video_labels)):
        tmp_info = video_labels[j]
        tmp_start = max(min(1, tmp_info['segment'][0] / corrected_second), 0)
        tmp_end = max(min(1, tmp_info['segment'][1] / corrected_second), 0)
        gt_bbox.append([tmp_start, tmp_end])

    ##########################################################################
    # generate R_s and R_e
    gt_bbox = np.array(gt_bbox)
    gt_xmins = gt_bbox[:, 0]
    gt_xmaxs = gt_bbox[:, 1]
    # gt_lens = gt_xmaxs - gt_xmins
    gt_len_small = 3 * self.temporal_gap  # np.maximum(self.temporal_gap, self.boundary_ratio * gt_lens)
    gt_start_bboxs = np.stack(
        (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
    gt_end_bboxs = np.stack(
        (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)

    ##########################################################################
    # dense IoU between every candidate proposal (start bin i, end bin j)
    # and the ground-truth segments
    gt_iou_map = np.zeros([self.temporal_scale, self.temporal_scale])
    for i in range(self.temporal_scale):
        for j in range(i, self.temporal_scale):
            gt_iou_map[i, j] = np.max(
                iou_with_anchors(i * self.temporal_gap,
                                 (j + 1) * self.temporal_gap,
                                 gt_xmins, gt_xmaxs))
    gt_iou_map = torch.Tensor(gt_iou_map)

    ##########################################################################
    # calculate the ioa for all timestamps
    match_score_start = []
    for jdx in range(len(anchor_xmin)):
        match_score_start.append(np.max(
            ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                             gt_start_bboxs[:, 0], gt_start_bboxs[:, 1])))
    match_score_end = []
    for jdx in range(len(anchor_xmin)):
        match_score_end.append(np.max(
            ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                             gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
    match_score_start = torch.Tensor(match_score_start)
    match_score_end = torch.Tensor(match_score_end)
    ##########################################################################
    return match_score_start, match_score_end, gt_iou_map
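

# The two overlap helpers used above are not shown in this file. Below is a
# minimal sketch of what they typically compute in BSN/BMN-style codebases;
# treat the exact names and signatures as an assumption if this repo defines
# them elsewhere.
import numpy as np


def ioa_with_anchors(anchors_min, anchors_max, box_min, box_max):
    # intersection over the anchors' own length (IoA), vectorized over anchors
    len_anchors = anchors_max - anchors_min
    int_xmin = np.maximum(anchors_min, box_min)
    int_xmax = np.minimum(anchors_max, box_max)
    inter_len = np.maximum(int_xmax - int_xmin, 0.)
    return np.divide(inter_len, len_anchors)


def iou_with_anchors(anchors_min, anchors_max, box_min, box_max):
    # classic temporal IoU (Jaccard index), vectorized over anchors
    len_anchors = anchors_max - anchors_min
    int_xmin = np.maximum(anchors_min, box_min)
    int_xmax = np.minimum(anchors_max, box_max)
    inter_len = np.maximum(int_xmax - int_xmin, 0.)
    union_len = len_anchors - inter_len + box_max - box_min
    return np.divide(inter_len, union_len)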
def soft_nms(df, alpha, t1, t2):
    '''
    df: proposals generated by the network;
    alpha: alpha value of the Gaussian decaying function;
    t1, t2: lower and upper thresholds for soft NMS.
    '''
    df = df.sort_values(by="score", ascending=False)  # sort by score, descending
    tstart = list(df.xmin.values[:])
    tend = list(df.xmax.values[:])
    tscore = list(df.score.values[:])

    rstart = []
    rend = []
    rscore = []

    # keep at most the top ~100 proposals for each video
    while len(tscore) > 1 and len(rscore) < 101:
        max_index = tscore.index(max(tscore))
        tmp_iou_list = iou_with_anchors(np.array(tstart), np.array(tend),
                                        tstart[max_index], tend[max_index])
        for idx in range(0, len(tscore)):
            if idx != max_index:
                tmp_iou = tmp_iou_list[idx]
                tmp_width = tend[max_index] - tstart[max_index]
                # decay the scores of proposals overlapping the current best;
                # the IoU threshold grows linearly with the proposal width
                if tmp_iou > t1 + (t2 - t1) * tmp_width:
                    tscore[idx] = tscore[idx] * np.exp(-np.square(tmp_iou) / alpha)

        rstart.append(tstart[max_index])
        rend.append(tend[max_index])
        rscore.append(tscore[max_index])
        tstart.pop(max_index)
        tend.pop(max_index)
        tscore.pop(max_index)

    newDf = pd.DataFrame()
    newDf['score'] = rscore
    newDf['xmin'] = rstart
    newDf['xmax'] = rend
    return newDf
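

# A hypothetical end-to-end call, assuming `iou_with_anchors` from the sketch
# above is in scope; the alpha / t1 / t2 values are illustrative, not this
# repo's configuration.
import numpy as np
import pandas as pd

proposals = pd.DataFrame({
    'xmin':  [0.10, 0.12, 0.50, 0.55],
    'xmax':  [0.40, 0.42, 0.80, 0.85],
    'score': [0.90, 0.85, 0.70, 0.60],
})
kept = soft_nms(proposals, alpha=0.4, t1=0.5, t2=0.9)
print(kept)  # surviving proposals, in selection order, with decayed scores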
def _get_train_label(self, index):
    video_id = self.video_ids[index]
    video_info = self.event_dict[video_id]
    video_labels = video_info['events']  # the measurement is second, not frame
    duration = video_info['duration']

    ##########################################################################
    # change the measurement from second to percentage
    gt_bbox = []
    gt_iou_map = []
    for j in range(len(video_labels)):
        tmp_info = video_labels[j]
        tmp_start = max(min(1, tmp_info['segment'][0] / duration), 0)
        tmp_end = max(min(1, tmp_info['segment'][1] / duration), 0)
        gt_bbox.append([tmp_start, tmp_end])
        tmp_gt_iou_map = iou_with_anchors(self.match_map[:, 0],
                                          self.match_map[:, 1],
                                          tmp_start, tmp_end)
        tmp_gt_iou_map = np.reshape(tmp_gt_iou_map,
                                    [self.max_duration, self.temporal_dim])
        gt_iou_map.append(tmp_gt_iou_map)
    gt_iou_map = np.array(gt_iou_map)
    gt_iou_map = np.max(gt_iou_map, axis=0)
    gt_iou_map = torch.Tensor(gt_iou_map)

    ##########################################################################
    # generate R_s and R_e
    gt_bbox = np.array(gt_bbox)
    gt_xmins = gt_bbox[:, 0]
    gt_xmaxs = gt_bbox[:, 1]
    # gt_lens = gt_xmaxs - gt_xmins
    gt_len_small = 3 * self.temporal_gap  # np.maximum(self.temporal_gap, self.boundary_ratio * gt_lens)
    gt_start_bboxs = np.stack(
        (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
    gt_end_bboxs = np.stack(
        (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)

    ##########################################################################
    # calculate the ioa for all timestamps
    match_score_start = []
    for jdx in range(len(self.anchor_xmin)):
        match_score_start.append(
            np.max(
                ioa_with_anchors(self.anchor_xmin[jdx], self.anchor_xmax[jdx],
                                 gt_start_bboxs[:, 0], gt_start_bboxs[:, 1])))
    match_score_end = []
    for jdx in range(len(self.anchor_xmin)):
        match_score_end.append(
            np.max(
                ioa_with_anchors(self.anchor_xmin[jdx], self.anchor_xmax[jdx],
                                 gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
    match_score_start = torch.tensor(match_score_start)
    match_score_end = torch.tensor(match_score_end)
    ##########################################################################
    return match_score_start, match_score_end, gt_iou_map
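

# `self.match_map`, `self.anchor_xmin` and `self.anchor_xmax` are built
# elsewhere. A sketch of the usual BMN-style construction, adapted to the
# max_duration / temporal_dim names used above (an assumption about this
# repo, not a confirmed implementation):
import numpy as np


def _get_match_map(self):
    temporal_gap = 1. / self.temporal_dim
    match_map = []
    for idx in range(self.temporal_dim):  # start position, in bins
        tmp_match_window = []
        xmin = temporal_gap * idx
        for jdx in range(1, self.max_duration + 1):  # duration, in bins
            xmax = xmin + temporal_gap * jdx
            tmp_match_window.append([xmin, xmax])
        match_map.append(tmp_match_window)
    match_map = np.array(match_map)                  # [T, D, 2]
    match_map = np.transpose(match_map, [1, 0, 2])   # [D, T, 2]
    # duration-major flattening, so the reshape to [max_duration, temporal_dim]
    # in _get_train_label recovers the [D, T] layout
    self.match_map = np.reshape(match_map, [-1, 2])  # [D*T, 2]
    self.anchor_xmin = [temporal_gap * (i - 0.5) for i in range(self.temporal_dim)]
    self.anchor_xmax = [temporal_gap * (i + 0.5) for i in range(1, self.temporal_dim + 1)]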
def _get_label(self, duration, timestamps, video_length):
    # change the measurement from second to percentage
    gt_bbox = []
    coefficient = 1 / float(duration)
    for timestamp in timestamps:
        start = max(min(float(timestamp[0]) * coefficient, 1), 0)
        end = min(max(float(timestamp[1]) * coefficient, 0), 1)
        gt_bbox.append([start, end])
    gt_bbox = np.array(gt_bbox)
    gt_xmins = gt_bbox[:, 0]
    gt_xmaxs = gt_bbox[:, 1]

    # expand each ground-truth boundary into a small region, 3 bins wide
    gt_len_small = 3. / self.temporal_scale
    gt_start_bboxs = np.stack(
        (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2), axis=1)
    gt_end_bboxs = np.stack(
        (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2), axis=1)

    # dense IoU between every (start bin i, end bin j) cell and the
    # ground-truth segments
    confidence_map = np.zeros((self.temporal_scale, self.temporal_scale))
    for i in range(self.temporal_scale):
        for j in range(i, self.temporal_scale):
            confidence_map[i, j] = np.max(
                iou_with_anchors(i / self.temporal_scale,
                                 (j + 1) / self.temporal_scale,
                                 gt_xmins, gt_xmaxs))
    confidence_map = torch.tensor(confidence_map)

    # IoA of every anchor bin with the expanded start / end regions
    start_label_map = []
    end_label_map = []
    for xmin, xmax in zip(self.anchor_xmin, self.anchor_xmax):
        start_label_map.append(
            np.max(
                ioa_with_anchors(xmin, xmax,
                                 gt_start_bboxs[:, 0], gt_start_bboxs[:, 1])))
        end_label_map.append(
            np.max(
                ioa_with_anchors(xmin, xmax,
                                 gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
    start_label_map = torch.tensor(start_label_map)
    end_label_map = torch.tensor(end_label_map)
    return start_label_map, end_label_map, confidence_map
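

# A hypothetical smoke test of the label shapes, calling the method with a
# stand-in `self`. `SomeDataset` is a placeholder for whatever class defines
# _get_label, and the anchor construction is an assumption borrowed from the
# reference BMN code, not necessarily this repo's.
from types import SimpleNamespace

ts = 100
dummy = SimpleNamespace(
    temporal_scale=ts,
    anchor_xmin=[(i - 0.5) / ts for i in range(ts)],
    anchor_xmax=[(i + 0.5) / ts for i in range(1, ts + 1)],
)
start, end, conf = SomeDataset._get_label(dummy, duration=60.0,
                                          timestamps=[[5.0, 20.0]],
                                          video_length=None)
print(start.shape, end.shape, conf.shape)
# expected: torch.Size([100]) torch.Size([100]) torch.Size([100, 100])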
def _get_train_label(self, index, anchor_xmin, anchor_xmax):
    video_name = self.video_list[index]
    video_info = self.video_dict[video_name]
    video_frame = video_info['duration_frame']  # e.g. 1128
    video_second = video_info['duration_second']  # e.g. 47.114
    feature_frame = video_info['feature_frame']  # e.g. 1120
    # there are some frames not used, e.g. 46.77
    corrected_second = float(feature_frame) / video_frame * video_second
    # the measurement is second, not frame,
    # e.g. [{'segment': [0.01, 37.11], 'labels': 'waxing skis'}]
    video_labels = video_info['annotations']

    ##########################################################################
    # change the measurement from second to percentage: start / end times
    # become fractions of the corrected video duration
    gt_bbox = []
    gt_iou_map = []
    for j in range(len(video_labels)):
        tmp_info = video_labels[j]
        tmp_start = max(min(1, tmp_info['segment'][0] / corrected_second), 0)  # e.g. 0.00
        tmp_end = max(min(1, tmp_info['segment'][1] / corrected_second), 0)  # e.g. 0.79
        gt_bbox.append([tmp_start, tmp_end])
        # IoU of the current ground-truth segment with all pre-defined anchors
        tmp_gt_iou_map = iou_with_anchors(self.match_map[:, 0],
                                          self.match_map[:, 1],
                                          tmp_start, tmp_end)  # (10000,)
        # after this reshape, rows are the different start/end times and
        # columns the different durations; e.g. for a column vector:
        #     [[1],
        #      [2],     ->   [[1, 2],
        #      [3],          [3, 4]]
        #      [4]]
        tmp_gt_iou_map = np.reshape(
            tmp_gt_iou_map,
            [self.temporal_scale, self.temporal_scale])  # (100, 100)
        # effectively a lookup table of the IoU between every candidate
        # proposal and this ground-truth segment
        gt_iou_map.append(tmp_gt_iou_map)
    gt_iou_map = np.array(gt_iou_map)  # (1, 100, 100); 1 = number of ground truths
    # with several ground truths, keep the maximum IoU at each location
    gt_iou_map = np.max(gt_iou_map, axis=0)
    gt_iou_map = torch.Tensor(gt_iou_map)

    ##########################################################################
    # generate R_s and R_e: expand each ground-truth boundary instant
    # into a small region
    gt_bbox = np.array(gt_bbox)
    gt_xmins = gt_bbox[:, 0]
    gt_xmaxs = gt_bbox[:, 1]
    gt_lens = gt_xmaxs - gt_xmins
    gt_len_small = 3 * self.temporal_gap  # width 0.03; np.maximum(self.temporal_gap, self.boundary_ratio * gt_lens)
    gt_start_bboxs = np.stack(
        (gt_xmins - gt_len_small / 2, gt_xmins + gt_len_small / 2),
        axis=1)  # e.g. [0.12, 0.15]
    gt_end_bboxs = np.stack(
        (gt_xmaxs - gt_len_small / 2, gt_xmaxs + gt_len_small / 2),
        axis=1)  # e.g. [0.85, 0.88]

    ##########################################################################
    # calculate the ioa for all timestamps: overlap of each small 0.02-wide
    # interval with gt_start and gt_end (it is not obvious at first why the
    # labels can be built this way; see the worked example after this function)
    match_score_start = []  # (100,)
    for jdx in range(len(anchor_xmin)):
        match_score_start.append(
            np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                                 gt_start_bboxs[:, 0], gt_start_bboxs[:, 1])))
    match_score_end = []  # (100,)
    for jdx in range(len(anchor_xmin)):
        match_score_end.append(
            np.max(
                ioa_with_anchors(anchor_xmin[jdx], anchor_xmax[jdx],
                                 gt_end_bboxs[:, 0], gt_end_bboxs[:, 1])))
    match_score_start = torch.Tensor(match_score_start)
    match_score_end = torch.Tensor(match_score_end)
    ##########################################################################
    return match_score_start, match_score_end, gt_iou_map
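

# Worked example for the boundary labels above (why IoA against the expanded
# region gives a soft start score). Assumes temporal_scale = 100
# (temporal_gap = 0.01), the reference-BMN anchor construction, and a single
# ground truth starting at t = 0.135, so gt_start_bboxs = [[0.12, 0.15]].
import numpy as np

tgap = 0.01
anchor_xmin = np.array([tgap * (i - 0.5) for i in range(100)])
anchor_xmax = np.array([tgap * (i + 0.5) for i in range(1, 101)])

region_min, region_max = 0.135 - 1.5 * tgap, 0.135 + 1.5 * tgap  # [0.12, 0.15]
inter = np.maximum(
    np.minimum(anchor_xmax, region_max) - np.maximum(anchor_xmin, region_min), 0.)
scores = inter / (anchor_xmax - anchor_xmin)
print(np.round(scores[11:16], 2))  # [0.25 0.75 1.   0.75 0.25]
# anchors centered near t = 0.135 score 1 and the score ramps down on both
# sides: a soft "startness" target rather than a single one-hot index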