import math
import os
import time
from collections import defaultdict
from random import shuffle

import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable

DEBUG = False  # module-level flag checked by compute_targets


def compute_targets(ground_truth, proposals, pos_thresh=0.7, neg_thresh=0.3):
    # Videos present in both the ground truth and the proposals.
    video_set = set(ground_truth['video-id'].unique()).intersection(
        proposals['video-id'].unique())
    # Group by video-id for faster lookups.
    ground_truth_gbvn = ground_truth.groupby('video-id')
    proposals_gbvn = proposals.groupby('video-id')
    proposal_targets = proposals.copy()
    # Label -1 marks proposals to ignore.
    proposal_labels = np.full(proposal_targets.shape[0], -1)
    proposal_tiou = np.full(proposal_targets.shape[0], -1.0)
    # For each video, compute tIoU scores between proposals and ground truth.
    for videoid in video_set:
        ground_truth_videoid = ground_truth_gbvn.get_group(videoid)
        this_video_ground_truth_idx = ground_truth_videoid.reset_index()
        this_video_ground_truth = this_video_ground_truth_idx.loc[
            :, ['t-start', 't-end']].values
        proposals_videoid = proposals_gbvn.get_group(videoid)
        this_video_proposals = proposals_videoid.loc[
            :, ['t-start', 't-end']].values
        this_video_proposals_idx = proposals_videoid.loc[
            :, ['t-start', 't-end']].index
        for idx, this_proposal in enumerate(this_video_proposals):
            tiou = segment_iou(this_proposal, this_video_ground_truth)
            argmax = tiou.argmax()
            if tiou[argmax] > pos_thresh:
                # Foreground: take the label of the best-matching ground truth.
                proposal_labels[this_video_proposals_idx[idx]] = \
                    this_video_ground_truth_idx.label[argmax]
            elif tiou[argmax] < neg_thresh:
                # Background.
                proposal_labels[this_video_proposals_idx[idx]] = 0
            proposal_tiou[this_video_proposals_idx[idx]] = tiou[argmax]
    # Subsample negatives: keep roughly num_pos / class_num background
    # samples, i.e. the average number of positives per class.
    pos_idxs = np.where(proposal_labels > 0)[0]
    num_pos = pos_idxs.shape[0]
    neg_idxs = np.where(proposal_labels == 0)[0]
    class_num = max(proposal_labels)
    num_neg = min(int(num_pos / class_num), neg_idxs.shape[0])
    neg_idxs = np.random.permutation(neg_idxs)
    proposal_labels[neg_idxs[num_neg:]] = -1
    proposal_targets['label'] = proposal_labels
    proposal_targets['tiou'] = proposal_tiou
    proposal_targets = proposal_targets[
        proposal_targets.label != -1].reset_index()
    if DEBUG:
        for l in range(201):  # class 0 is background
            num = sum(proposal_targets['label'].values == l)
            print(num, " samples for class ", l)
    return proposal_targets
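
# `segment_iou` is called by compute_targets above (and by the functions
# below) but is not defined in this file. A minimal sketch, assuming it
# follows the ActivityNet evaluation convention: the temporal IoU of one
# target segment against each of N candidate segments. Only the first two
# columns of `candidate_segments` are read, so (N, 2) or (N, 3) inputs
# both work.
def segment_iou(target_segment, candidate_segments):
    # target_segment: array [t-start, t-end]; candidate_segments: (N, >=2).
    tt1 = np.maximum(target_segment[0], candidate_segments[:, 0])
    tt2 = np.minimum(target_segment[1], candidate_segments[:, 1])
    # Clip to zero so disjoint segments get zero intersection.
    segments_intersection = (tt2 - tt1).clip(0)
    # Union = sum of both lengths minus their intersection.
    segments_union = (candidate_segments[:, 1] - candidate_segments[:, 0]) \
        + (target_segment[1] - target_segment[0]) - segments_intersection
    return segments_intersection.astype(float) / segments_union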
def _get_pos_neg(split_path, annotations, vid, slide_window_size,
                 sampling_sec, anc_len_all, anc_cen_all, pos_thresh,
                 neg_thresh):
    if os.path.isfile(os.path.join(split_path, vid + '_bn.npy')):
        print('video: {}'.format(vid))
        video_prefix = os.path.join(split_path, vid)
        # Load features (T x H). Loaded with numpy instead of torch here,
        # since torch tensors caused errors under multiprocessing.
        resnet_feat = np.load(video_prefix + '_resnet.npy')
        bn_feat = np.load(video_prefix + '_bn.npy')
        if resnet_feat.shape[0] != bn_feat.shape[0]:
            raise Exception('number of frames does not match in feature!')
        total_frame = bn_feat.shape[0]

        window_start = 0
        window_end = slide_window_size
        window_start_t = window_start * sampling_sec
        window_end_t = window_end * sampling_sec
        pos_seg = defaultdict(list)
        neg_overlap = [0] * anc_len_all.shape[0]
        pos_collected = [False] * anc_len_all.shape[0]
        for j in range(anc_len_all.shape[0]):
            potential_match = []
            for ann_idx, ann in enumerate(annotations):
                seg = ann['segment']
                gt_start = seg[0] / sampling_sec
                gt_end = seg[1] / sampling_sec
                if gt_start > gt_end:
                    gt_start, gt_end = gt_end, gt_start
                if anc_cen_all[j] + anc_len_all[j] / 2. <= total_frame:
                    if window_start_t <= seg[0] and \
                            window_end_t + sampling_sec * 2 >= seg[1]:
                        overlap = segment_iou(
                            np.array([gt_start, gt_end]),
                            np.array([[
                                anc_cen_all[j] - anc_len_all[j] / 2.,
                                anc_cen_all[j] + anc_len_all[j] / 2.
                            ]]))
                        neg_overlap[j] = max(overlap, neg_overlap[j])
                        if not pos_collected[j] and overlap >= pos_thresh:
                            len_offset = math.log(
                                (gt_end - gt_start) / anc_len_all[j])
                            cen_offset = ((gt_end + gt_start) / 2. -
                                          anc_cen_all[j]) / anc_len_all[j]
                            potential_match.append(
                                (ann_idx, j, overlap, len_offset,
                                 cen_offset, ann['sentence_idx']))
                            pos_collected[j] = True
            # Assign this anchor to an annotation that has no match yet;
            # otherwise pick one of the candidates at random.
            filled = False
            for item in potential_match:
                if item[0] not in pos_seg:
                    filled = True
                    pos_seg[item[0]].append(tuple(item[1:]))
                    break
            if not filled and len(potential_match) > 0:
                # randomly choose one
                shuffle(potential_match)
                item = potential_match[0]
                pos_seg[item[0]].append(tuple(item[1:]))

        missing_prop = 0
        if len(pos_seg.keys()) != len(annotations):
            print('Some annotations in video {} do not have '
                  'any matching proposal'.format(video_prefix))
            missing_prop = len(annotations) - len(pos_seg.keys())

        neg_seg = []
        for oi, overlap in enumerate(neg_overlap):
            if overlap < neg_thresh:
                neg_seg.append((oi, overlap))

        npos_seg = 0
        for k in pos_seg:
            npos_seg += len(pos_seg[k])

        print('pos anc: {}, neg anc: {}'.format(npos_seg, len(neg_seg)))
        return video_prefix, total_frame, pos_seg, neg_seg, missing_prop
    else:
        return None
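
# `anc_len_all` and `anc_cen_all` are the flattened multi-scale anchor
# grid consumed by _get_pos_neg. A sketch of how such a grid can be built;
# `build_anchors` and its arguments are illustrative names, and the
# center/stride arithmetic mirrors the construction of `anchor_c` in
# inference() below.
def build_anchors(slide_window_size, kernel_list, stride_factor):
    anc_len_lst, anc_cen_lst = [], []
    for kernel_len in kernel_list:
        # Centers are strided by ceil(kernel_len / stride_factor) and kept
        # inside the sliding window.
        anc_cen = np.arange(float(kernel_len) / 2.,
                            float(slide_window_size + 1 - kernel_len / 2.),
                            math.ceil(kernel_len / stride_factor))
        anc_len = np.full(anc_cen.shape, float(kernel_len))
        anc_len_lst.append(anc_len)
        anc_cen_lst.append(anc_cen)
    # One entry per anchor, concatenated across all scales.
    return np.hstack(anc_len_lst), np.hstack(anc_cen_lst)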
def inference(self, x, actual_frame_length, sampling_sec, min_prop_num,
              max_prop_num, min_prop_num_before_nms, pos_thresh,
              stride_factor, gated_mask=False):
    B, T, _ = x.size()
    dtype = x.data.type()
    x_rgb, x_flow = torch.split(x, 2048, 2)
    x_rgb = self.rgb_emb(x_rgb.contiguous())
    x_flow = self.flow_emb(x_flow.contiguous())
    x = torch.cat((x_rgb, x_flow), 2)
    x = self.emb_out(x)

    vis_feat, all_emb = self.vis_emb(x)
    # vis_feat = self.vis_dropout(vis_feat)

    # B x T x H -> B x H x T for 1d conv
    vis_feat = vis_feat.transpose(1, 2).contiguous()

    prop_lst = []
    for i, kernel in enumerate(self.prop_out):
        kernel_size = self.kernel_list[i]
        if kernel_size <= actual_frame_length[0]:
            # No need to use a larger kernel size in this case;
            # batch size is only 1.
            pred_o = kernel(vis_feat)
            anchor_c = Variable(torch.FloatTensor(np.arange(
                float(kernel_size) / 2.0,
                float(T + 1 - kernel_size / 2.0),
                math.ceil(kernel_size / stride_factor))).type(dtype))
            if anchor_c.size(0) != pred_o.size(-1):
                raise Exception("size mismatch!")
            anchor_c = anchor_c.expand(B, 1, anchor_c.size(0))
            anchor_l = Variable(torch.FloatTensor(
                anchor_c.size()).fill_(kernel_size).type(dtype))
            pred_final = torch.cat((pred_o, anchor_l, anchor_c), 1)
            prop_lst.append(pred_final)
        else:
            print('skipping kernel sizes greater than {}'.format(
                self.kernel_list[i]))
            break

    prop_all = torch.cat(prop_lst, 2)

    # Assume the first two channels are the action prediction and the
    # overlap score, respectively.
    prop_all[:, :2, :] = F.sigmoid(prop_all[:, :2, :])

    # Decode length/center offsets (channels 2, 3) against the anchor
    # length/center (channels 4, 5).
    pred_len = prop_all[:, 4, :] * torch.exp(prop_all[:, 2, :])
    pred_cen = prop_all[:, 5, :] + prop_all[:, 4, :] * prop_all[:, 3, :]

    nms_thresh_set = np.arange(0.9, 0.95, 0.05).tolist()
    all_proposal_results = []

    # Store positional encodings in four equal blocks: predicted starts,
    # predicted ends, anchor starts, and anchor ends.
    pred_start_lst = []
    pred_end_lst = []
    anchor_start_lst = []
    anchor_end_lst = []
    anchor_window_mask = []
    gate_scores = []

    for b in range(B):
        crt_pred = prop_all.data[b]
        crt_pred_cen = pred_cen.data[b]
        crt_pred_len = pred_len.data[b]
        pred_masks = []
        batch_result = []
        crt_nproposal = 0
        nproposal = torch.sum(torch.gt(prop_all.data[b, 0, :], pos_thresh))
        nproposal = min(max(nproposal, min_prop_num_before_nms),
                        prop_all.size(-1))
        pred_results = np.empty((nproposal, 3))
        _, sel_idx = torch.topk(crt_pred[0], nproposal)

        start_t = time.time()
        for nms_thresh in nms_thresh_set:
            for prop_idx in range(nproposal):
                # The video might be truncated at the end,
                # hence + sampling_sec * 2.
                original_frame_len = actual_frame_length[b].item() \
                    + sampling_sec * 2
                pred_start_w = crt_pred_cen[sel_idx[prop_idx]] \
                    - crt_pred_len[sel_idx[prop_idx]] / 2.0
                pred_end_w = crt_pred_cen[sel_idx[prop_idx]] \
                    + crt_pred_len[sel_idx[prop_idx]] / 2.0
                pred_start = pred_start_w
                pred_end = pred_end_w
                if pred_start >= pred_end:
                    continue
                if pred_end >= original_frame_len or pred_start < 0:
                    continue

                hasoverlap = False
                if crt_nproposal > 0:
                    if np.max(segment_iou(
                            np.array([pred_start, pred_end]),
                            pred_results[:crt_nproposal])) > nms_thresh:
                        hasoverlap = True

                if not hasoverlap:
                    pred_bin_window_mask = torch.zeros(1, T, 1).type(dtype)
                    win_start = math.floor(
                        max(min(pred_start,
                                min(original_frame_len, T) - 1), 0))
                    win_end = math.ceil(
                        max(min(pred_end, min(original_frame_len, T)), 1))
                    # if win_start >= win_end:
                    #     print('length: {}, mask window start: {} >= '
                    #           'window end: {}, skipping'.format(
                    #               original_frame_len, win_start, win_end))
                    #     continue
                    pred_bin_window_mask[:, win_start:win_end] = 1
                    pred_masks.append(pred_bin_window_mask)

                    if self.learn_mask:
                        # Channels 4 and 5 hold the anchor length and center.
                        anc_len = crt_pred[4, sel_idx[prop_idx]]
                        anc_cen = crt_pred[5, sel_idx[prop_idx]]
                        # Only the positive samples are used to train the
                        # mask; more samples could potentially be used, but
                        # this is easier to do.
                        amask = torch.zeros(1, T).type(dtype)
                        amask[0, max(0, math.floor(anc_cen - anc_len / 2.)):
                              min(T, math.ceil(anc_cen + anc_len / 2.))] = 1.
                        anchor_window_mask.append(amask)
                        pred_start_lst.append(
                            torch.Tensor([pred_start_w]).type(dtype))
                        pred_end_lst.append(
                            torch.Tensor([pred_end_w]).type(dtype))
                        anchor_start_lst.append(torch.Tensor(
                            [max(0, math.floor(anc_cen - anc_len / 2.))]
                        ).type(dtype))
                        anchor_end_lst.append(torch.Tensor(
                            [min(T, math.ceil(anc_cen + anc_len / 2.))]
                        ).type(dtype))
                        gate_scores.append(torch.Tensor(
                            [crt_pred[0, sel_idx[prop_idx]]]).type(dtype))

                    pred_results[crt_nproposal] = np.array(
                        [win_start, win_end, crt_pred[0, sel_idx[prop_idx]]])
                    crt_nproposal += 1

                if crt_nproposal >= max_prop_num:
                    break
            if crt_nproposal >= min_prop_num:
                break

        mid1_t = time.time()

        if len(pred_masks) == 0:
            # Fall back to a single all-one window if nothing was proposed.
            # Note: pred_results is a numpy array, so write into row 0
            # rather than calling .append() (which would raise).
            pred_masks.append(torch.ones(1, T, 1).type(dtype))
            pred_results[0] = np.array(
                [0, min(original_frame_len, T), pos_thresh])
            crt_nproposal = 1

        pred_masks = Variable(torch.cat(pred_masks, 0))
        batch_x = x[b].unsqueeze(0).expand(
            pred_masks.size(0), x.size(1), x.size(2))

        if self.learn_mask:
            pe_pred_start = torch.cat(pred_start_lst, 0)
            pe_pred_end = torch.cat(pred_end_lst, 0)
            pe_anchor_start = torch.cat(anchor_start_lst, 0)
            pe_anchor_end = torch.cat(anchor_end_lst, 0)
            pe_locs = torch.cat((pe_pred_start, pe_pred_end,
                                 pe_anchor_start, pe_anchor_end), 0)
            pos_encs = positional_encodings(pe_locs, self.d_model // 4)
            npos = pos_encs.size(0)
            anchor_window_mask = Variable(torch.cat(anchor_window_mask, 0))
            in_pred_mask = torch.cat(
                (pos_encs[:npos // 4],
                 pos_encs[npos // 4:npos // 4 * 2],
                 pos_encs[npos // 4 * 2:npos // 4 * 3],
                 pos_encs[npos // 4 * 3:npos // 4 * 4],
                 anchor_window_mask), 1)
            pred_cont_masks = self.mask_model(in_pred_mask).unsqueeze(2)

            if gated_mask:
                # Blend the binary and continuous masks with the
                # predicted gate scores.
                gate_scores = Variable(
                    torch.cat(gate_scores, 0).view(-1, 1, 1))
                window_mask = (gate_scores * pred_masks
                               + (1 - gate_scores) * pred_cont_masks)
            else:
                window_mask = pred_cont_masks
        else:
            window_mask = pred_masks

        mid2_t = time.time()

        pred_sentence = []
        # Caption the proposals in mini-batches of cap_batch.
        cap_batch = math.ceil(480 * 256 / T)
        for sent_i in range(math.ceil(window_mask.size(0) / cap_batch)):
            batch_start = sent_i * cap_batch
            batch_end = min((sent_i + 1) * cap_batch, window_mask.size(0))
            pred_sentence += self.cap_model.greedy(
                batch_x[batch_start:batch_end],
                window_mask[batch_start:batch_end], 20)

        pred_results = pred_results[:crt_nproposal]
        assert len(pred_sentence) == crt_nproposal, (
            "number of predicted sentences and proposals does not match")

        for idx in range(len(pred_results)):
            batch_result.append((pred_results[idx][0], pred_results[idx][1],
                                 pred_results[idx][2], pred_sentence[idx]))
        all_proposal_results.append(tuple(batch_result))

        end_t = time.time()
        print('Processing time for tIoU: {:.2f}, mask: {:.2f}, '
              'caption: {:.2f}'.format(
                  mid1_t - start_t, mid2_t - mid1_t, end_t - mid2_t))

    return all_proposal_results
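
# `positional_encodings` is defined elsewhere in the repo. A minimal
# sketch, assuming it maps a 1-D tensor of (possibly fractional) positions
# to the standard sinusoidal encodings of width `d_model` (as in
# "Attention Is All You Need"); the repo's exact scaling may differ, and
# `d_model` is assumed even.
def positional_encodings(pos, d_model):
    # pos: 1-D tensor of N positions -> (N, d_model) encoding matrix.
    enc = torch.zeros(pos.size(0), d_model)
    div = torch.exp(torch.arange(0., float(d_model), 2.)
                    * (-math.log(10000.0) / d_model))
    enc[:, 0::2] = torch.sin(pos.unsqueeze(1).float() * div)
    enc[:, 1::2] = torch.cos(pos.unsqueeze(1).float() * div)
    return enc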
def _get_pos_neg(split_path, annotations, vid, slide_window_size,
                 sampling_sec, anc_len_all, anc_cen_all, pos_thresh,
                 neg_thresh):
    if os.path.isfile(os.path.join(split_path, vid + '_bn.npy')):
        print('video: {}'.format(vid))
        video_prefix = os.path.join(split_path, vid)
        # (T, 2048) RGB features
        resnet_feat = torch.from_numpy(
            np.load(video_prefix + '_resnet.npy')).float()
        # (T, 1024) flow features
        bn_feat = torch.from_numpy(
            np.load(video_prefix + '_bn.npy')).float()
        if resnet_feat.size(0) != bn_feat.size(0):
            raise Exception('number of frames does not match in feature!')
        total_frame = bn_feat.size(0)

        # Convert the window bounds from sampled-frame indices to seconds
        # so they can be compared against the annotation timestamps below.
        window_start = 0
        window_end = slide_window_size
        window_start_t = window_start * sampling_sec
        window_end_t = window_end * sampling_sec
        # defaultdict avoids missing-key errors: absent keys default to
        # an empty list.
        pos_seg = defaultdict(list)
        neg_overlap = [0] * anc_len_all.shape[0]  # e.g. 6338 anchors
        pos_collected = [False] * anc_len_all.shape[0]
        # Iterate over all predefined anchors.
        for j in range(anc_len_all.shape[0]):
            potential_match = []
            for ann_idx, ann in enumerate(annotations):
                seg = ann['segment']
                # Scale the ground-truth span (in seconds) onto the sampled
                # timeline; the anchors are defined over sampled frames, so
                # this keeps the two comparable.
                gt_start = seg[0] / sampling_sec
                gt_end = seg[1] / sampling_sec
                if gt_start > gt_end:
                    gt_start, gt_end = gt_end, gt_start
                # The anchor must not extend past total_frame, and the
                # ground truth must lie within the window.
                if anc_cen_all[j] + anc_len_all[j] / 2. <= total_frame:
                    if window_start_t <= seg[0] and \
                            window_end_t + sampling_sec * 2 >= seg[1]:
                        overlap = segment_iou(
                            np.array([gt_start, gt_end]),
                            np.array([[
                                anc_cen_all[j] - anc_len_all[j] / 2.,
                                anc_cen_all[j] + anc_len_all[j] / 2.]]))
                        neg_overlap[j] = max(overlap, neg_overlap[j])
                        if not pos_collected[j] and overlap >= pos_thresh:
                            len_offset = math.log(
                                (gt_end - gt_start) / anc_len_all[j])
                            cen_offset = ((gt_end + gt_start) / 2. -
                                          anc_cen_all[j]) / anc_len_all[j]
                            potential_match.append(
                                (ann_idx, j, overlap, len_offset,
                                 cen_offset, ann['sentence_idx']))
                            pos_collected[j] = True
            # Store the collected positive sample in pos_seg:
            # {ann_idx: [(j, overlap, len_offset, cen_offset,
            #             ann['sentence_idx']), ...]}
            filled = False
            for item in potential_match:
                if item[0] not in pos_seg:
                    filled = True
                    pos_seg[item[0]].append(tuple(item[1:]))
                    break
            if not filled and len(potential_match) > 0:
                # randomly choose one
                shuffle(potential_match)
                item = potential_match[0]
                pos_seg[item[0]].append(tuple(item[1:]))

        # Some ground-truth segments may have no matching anchor at all.
        missing_prop = 0
        if len(pos_seg.keys()) != len(annotations):
            print('Some annotations in video {} do not have '
                  'any matching proposal'.format(video_prefix))
            missing_prop = len(annotations) - len(pos_seg.keys())

        neg_seg = []
        for oi, overlap in enumerate(neg_overlap):
            if overlap < neg_thresh:
                neg_seg.append((oi, overlap))

        npos_seg = 0
        for k in pos_seg:
            npos_seg += len(pos_seg[k])

        print('pos anc: {}, neg anc: {}'.format(npos_seg, len(neg_seg)))
        return video_prefix, total_frame, pos_seg, neg_seg, missing_prop
    else:
        return None
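
# The first _get_pos_neg variant notes that torch tensors caused errors
# under multiprocessing. A sketch of how the function might be fanned out
# across videos with a process pool; `collect_pos_neg` and the
# {vid: annotations} layout of `raw_data` are hypothetical stand-ins for
# however the dataset code organizes its inputs.
from multiprocessing import Pool

def collect_pos_neg(split_path, raw_data, slide_window_size, sampling_sec,
                    anc_len_all, anc_cen_all, pos_thresh, neg_thresh,
                    num_workers=4):
    args = [(split_path, anns, vid, slide_window_size, sampling_sec,
             anc_len_all, anc_cen_all, pos_thresh, neg_thresh)
            for vid, anns in raw_data.items()]
    with Pool(num_workers) as pool:
        results = pool.starmap(_get_pos_neg, args)
    # _get_pos_neg returns None when a video's feature file is missing.
    return [r for r in results if r is not None]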