def _parse_prop_file(self, stats=None):
    """Parse the proposal file and, in training mode, build the FG/BG/incomplete pools.

    Args:
        stats: optional precomputed regression statistics; when None they are
            computed from the assembled pools.
    """
    print('loading prop_file ' + self.prop_file)
    prop_info = load_proposal_file(self.prop_file)
    self.video_list = [VideoRecord(p, self.num_classes) for p in prop_info]
    print('max number of proposal in one video is %d'
          % max(len(v.proposals) for v in self.video_list))
    print('create video list')
    # empty proposal problem starts
    if self.exclude_empty and not self.test_mode:
        # Keep only videos that carry at least one ground-truth instance.
        self.video_list = [v for v in self.video_list if len(v.gt) > 0]
    self.video_dict = {v.id: v for v in self.video_list}
    if not self.test_mode:
        # construct three pools of (video_id, proposal) pairs:
        # 1. Foreground
        # 2. Background
        # 3. Incomplete
        self.fg_pool = []
        self.bg_pool = []
        self.incomp_pool = []
        for video in self.video_list:
            vid = video.id
            for prop in video.get_fg(self.fg_iou_thresh, self.gt_as_fg):
                self.fg_pool.append((vid, prop))
            incomp, bg = video.get_negatives(
                self.incomplete_iou_thresh, self.bg_iou_thresh,
                self.bg_coverage_thresh, self.incomplete_overlap_thresh)
            for prop in incomp:
                self.incomp_pool.append((vid, prop))
            for prop in bg:
                self.bg_pool.append((vid, prop))
        # NOTE(review): stats handling sits inside the training branch because
        # the regression stats are derived from the pools built above.
        if stats is None:
            self._compute_regresssion_stats()
        else:
            self.stats = stats
def _parse_prop_file(self, stats=None):
    """Parse the proposal file, build the FG/BG/incomplete pools and regression stats.

    Args:
        stats: optional precomputed (mean, std) regression statistics; when
            None they are computed from the foreground pool.
    """
    prop_info = load_proposal_file(self.prop_file)
    self.video_list = [SSNVideoRecord(p) for p in prop_info]

    if self.exclude_empty:
        # Drop videos that have no ground-truth instance.
        self.video_list = list(filter(lambda x: len(x.gt) > 0, self.video_list))

    self.video_dict = {v.id: v for v in self.video_list}

    # construct three pools:
    # 1. Foreground
    # 2. Background
    # 3. Incomplete
    self.fg_pool = []
    self.bg_pool = []
    self.incomp_pool = []

    for v in self.video_list:
        self.fg_pool.extend([(v.id, prop) for prop in v.get_fg(self.fg_iou_thresh, self.gt_as_fg)])

        incomp, bg = v.get_negatives(self.incomplete_iou_thresh, self.bg_iou_thresh,
                                     self.bg_coverage_thresh, self.incomplete_overlap_thresh)

        self.incomp_pool.extend([(v.id, prop) for prop in incomp])
        self.bg_pool.extend([(v.id, prop) for prop in bg])

    if stats is None:
        self._compute_regresssion_stats()
    else:
        self.stats = stats

    if self.verbose:
        # FIX: report previously printed "incomplete_proposals" and
        # "background_proposals" with stray underscores, inconsistent with
        # "foreground proposals" on the line above.
        print("""
SSNDataset: Proposal file {prop_file} parsed.

There are {pnum} usable proposals from {vnum} videos.
{fnum} foreground proposals
{inum} incomplete proposals
{bnum} background proposals

Sampling config:
FG/BG/INC: {fr}/{br}/{ir}
Video Centric: {vc}

Epoch size multiplier: {em}

Regression Stats:
Location: mean {stats[0][0]:.05f} std {stats[1][0]:.05f}
Duration: mean {stats[0][1]:.05f} std {stats[1][1]:.05f}
""".format(prop_file=self.prop_file,
           pnum=len(self.fg_pool) + len(self.bg_pool) + len(self.incomp_pool),
           fnum=len(self.fg_pool), inum=len(self.incomp_pool), bnum=len(self.bg_pool),
           fr=self.fg_per_video, br=self.bg_per_video, ir=self.incomplete_per_video,
           vnum=len(self.video_dict), vc=self.video_centric,
           stats=self.stats, em=self.epoch_multiplier))
    else:
        print("""
SSNDataset: Proposal file {prop_file} parsed.
""".format(prop_file=self.prop_file))
def _parse_prop_file(self, stats=None):
    """Load the proposal file and assemble the FG / incomplete / BG proposal pools.

    Args:
        stats: optional precomputed regression statistics; computed from the
            pools when None.
    """
    prop_info = load_proposal_file(self.prop_file, self.mode)
    self.video_list = [VideoRecord(p, self.mode) for p in prop_info]
    if self.exclude_empty:
        # Keep only videos with at least one ground-truth instance.
        self.video_list = [rec for rec in self.video_list if len(rec.gt) > 0]
    self.video_dict = {rec.id: rec for rec in self.video_list}
    # Three pools of (video_id, proposal) pairs, shared across all videos:
    # 1. Foreground
    # 2. Background
    # 3. Incomplete
    self.fg_pool = []
    self.bg_pool = []
    self.incomp_pool = []
    for rec in self.video_list:
        # Foreground proposals; gt_as_fg controls whether ground truth
        # itself is added to the foreground pool.
        fg = rec.get_fg(self.fg_iou_thresh, self.gt_as_fg)
        self.fg_pool += [(rec.id, p) for p in fg]
        incomp, bg = rec.get_negatives(self.incomplete_iou_thresh,
                                       self.bg_iou_thresh,
                                       self.bg_coverage_thresh,
                                       self.incomplete_overlap_thresh)
        self.incomp_pool += [(rec.id, p) for p in incomp]
        self.bg_pool += [(rec.id, p) for p in bg]
    if stats is None:
        self._compute_regresssion_stats()
    else:
        self.stats = stats
def _parse_prop_file(self):
    """Parse the proposal file and build the foreground/background pools."""
    prop_info = load_proposal_file(self.prop_file)
    self.video_list = [BinaryVideoRecord(p) for p in prop_info]

    if self.exclude_empty:
        # Drop videos with no ground-truth instance.
        self.video_list = [x for x in self.video_list if len(x.gt) > 0]

    self.video_dict = {v.id: v for v in self.video_list}

    # construct two pools of (video_id, proposal) pairs:
    # 1. Foreground
    # 2. Background
    self.fg_pool = []
    self.bg_pool = []

    for v in self.video_list:
        self.fg_pool.extend([
            (v.id, prop)
            for prop in v.get_fg(self.fg_iou_thresh, self.gt_as_fg)
        ])
        self.bg_pool.extend([(v.id, prop)
                             for prop in v.get_bg(self.bg_iou_thresh)])

    # FIX: report strings previously read "parse." and "muiltiplier", and the
    # two branches disagreed on the dataset name ("BinaryDataSet" vs
    # "BinaryDataset"); typos fixed and the name unified.
    if self.verbose:
        print("""
BinaryDataSet: Proposal file {prop_file} parsed.

There are {pnum} usable proposals from {vnum} videos.
{fnum} foreground proposals
{bnum} background proposals

Sampling config:
FG/BG: {fr}/{br}
Epoch size multiplier: {em}
""".format(
            prop_file=self.prop_file,
            pnum=len(self.fg_pool) + len(self.bg_pool),
            fnum=len(self.fg_pool),
            bnum=len(self.bg_pool),
            fr=self.fg_per_video,
            br=self.bg_per_video,
            vnum=len(self.video_dict),
            em=self.epoch_multiplier,
        ))
    else:
        print("""
BinaryDataSet: proposal file {prop_file} parsed.
""".format(prop_file=self.prop_file))
def _parse_prop_file(self, stats=None):
    """Parse the proposal file into video records and build the three proposal pools.

    Args:
        stats: optional precomputed regression statistics; when None the
            mean/std of the regression targets are computed from the pools.
    """
    # Each entry of prop_info describes one video:
    # (vid, n_frames, gt_boxes, pr_boxes).
    prop_info = load_proposal_file(self.prop_file)
    # Wrap each entry in a PGCNVideoRecord so its helper methods can be used
    # below to partition proposals into the three pools.
    self.video_list = [PGCNVideoRecord(p) for p in prop_info]
    if self.exclude_empty:
        # Discard videos that have no ground-truth instance.
        self.video_list = [v for v in self.video_list if len(v.gt) > 0]
    # Index the records by video id for fast lookup.
    self.video_dict = {v.id: v for v in self.video_list}
    # construct three pools of (video_id, proposal) pairs:
    # 1. Foreground
    # 2. Background
    # 3. Incomplete
    self.fg_pool = []
    self.bg_pool = []
    self.incomp_pool = []
    for v in self.video_list:
        # Foreground proposals selected by fg_iou_thresh; gt_as_fg controls
        # whether the ground truth itself joins the foreground pool.
        for prop in v.get_fg(self.fg_iou_thresh, self.gt_as_fg):
            self.fg_pool.append((v.id, prop))
        # Incomplete and background proposals, selected by the four
        # IoU/coverage/overlap thresholds below.
        incomp, bg = v.get_negatives(self.incomplete_iou_thresh,
                                     self.bg_iou_thresh,
                                     self.bg_coverage_thresh,
                                     self.incomplete_overlap_thresh)
        for prop in incomp:
            self.incomp_pool.append((v.id, prop))
        for prop in bg:
            self.bg_pool.append((v.id, prop))
    if stats is None:
        # Compute mean and std of the regression targets over the pools.
        self._compute_regresssion_stats()
    else:
        self.stats = stats
def _parse_prop_file(self, stats=None):
    """Read the proposal file and group every video's proposals into pools.

    Args:
        stats: optional precomputed regression statistics; derived from the
            pools when None.
    """
    # Per video, load_proposal_file yields: name, frame count, ground truth,
    # and the candidate proposals.
    prop_info = load_proposal_file(self.prop_file)
    self.video_list = [PGCNVideoRecord(info) for info in prop_info]
    if self.exclude_empty:
        # Remove videos that contain no ground-truth instance.
        self.video_list = list(filter(lambda rec: len(rec.gt) > 0, self.video_list))
    self.video_dict = {rec.id: rec for rec in self.video_list}
    # Pools of (video_name, proposal) pairs:
    # 1. Foreground
    # 2. Background
    # 3. Incomplete
    self.fg_pool, self.bg_pool, self.incomp_pool = [], [], []
    for rec in self.video_list:
        # Foreground proposals for this video.
        self.fg_pool.extend((rec.id, prop)
                            for prop in rec.get_fg(self.fg_iou_thresh, self.gt_as_fg))
        # Background and incomplete proposals for this video.
        incomp, bg = rec.get_negatives(self.incomplete_iou_thresh,
                                       self.bg_iou_thresh,
                                       self.bg_coverage_thresh,
                                       self.incomplete_overlap_thresh)
        self.incomp_pool.extend((rec.id, prop) for prop in incomp)
        self.bg_pool.extend((rec.id, prop) for prop in bg)
    if stats is None:
        self._compute_regresssion_stats()  # mean/std of the regression targets
    else:
        self.stats = stats
def __init__(self, feat_root, feat_model, prop_file=None, subset_videos=None,
             body_seg=5, video_centric=True, test_mode=False, feat_stride=16,
             input_dim=1024, prop_per_video=12, fg_ratio=6, bg_ratio=6,
             fg_iou_thresh=0.7, bg_iou_thresh=0.01, bg_coverage_thresh=0.02,
             sample_duration=100 * 16, gt_as_fg=True, test_interval=6,
             verbose=True, exclude_empty=True, epoch_multiplier=1,
             use_flow=True, only_flow=False, num_local=8,
             frame_path='../../data/activitynet/activity_net_frames'):
    """Configure the dataset: sampling ratios, feature paths, and video records.

    Args:
        feat_root: root directory of the extracted features.
        feat_model: name of the RGB feature model directory
            ('i3d_rgb', 'i3d_rgb_trained', 'inception_resnet_v2',
            'inception_resnet_v2_trained'); anything else raises
            NotImplementedError.
        prop_file: optional proposal file used to recover per-video frame counts.
        subset_videos: iterable of video entries wrapped into BinaryVideoRecord.

    Raises:
        NotImplementedError: if feat_model is not one of the supported names.
    """
    self.verbose = verbose
    self.num_local = num_local
    self.body_seg = body_seg
    self.video_centric = video_centric
    self.exclude_empty = exclude_empty
    self.epoch_multiplier = epoch_multiplier

    self.input_dim = input_dim
    self.feat_stride = feat_stride
    assert feat_stride % 8 == 0
    # sample_duration is given in frames; convert to feature steps.
    self.sample_duration = sample_duration // feat_stride

    self.test_mode = test_mode
    self.test_interval = test_interval

    self.fg_iou_thresh = fg_iou_thresh
    self.bg_iou_thresh = bg_iou_thresh
    self.bg_coverage_thresh = bg_coverage_thresh
    self.starting_ratio = 0.5
    self.ending_ratio = 0.5

    self.gt_as_fg = gt_as_fg
    # Split the per-video proposal budget between foreground and background.
    denum = fg_ratio + bg_ratio
    self.fg_per_video = int(prop_per_video * (fg_ratio / denum))
    self.bg_per_video = int(prop_per_video * (bg_ratio / denum))

    # set the directory for the optical-flow features
    if feat_model.endswith('_trained'):
        feat_flow_rpath = os.path.join(feat_root, 'i3d_flow_trained')
    else:
        feat_flow_rpath = os.path.join(feat_root, 'i3d_flow')
    print("using flow feature from {}".format(feat_flow_rpath))

    # obtain the h5 feature directory
    flow_h5_path = os.path.join(feat_flow_rpath, 'i3d_flow_feature.hdf5')
    flow_feat_key = 'i3d_flow_feature'

    feat_rgb_path = os.path.join(feat_root, feat_model)
    if feat_model == 'i3d_rgb' or feat_model == 'i3d_rgb_trained':
        rgb_h5_path = os.path.join(feat_rgb_path, 'i3d_rgb_feature.hdf5')
        rgb_feat_key = 'i3d_rgb_feature'
    elif feat_model == 'inception_resnet_v2':
        rgb_h5_path = os.path.join(feat_rgb_path, 'new_inception_resnet.hdf5')
        rgb_feat_key = 'inception_resnet_v2'
    elif feat_model == 'inception_resnet_v2_trained':
        rgb_h5_path = os.path.join(feat_rgb_path, 'inception_resnet_v2_trained.hdf5')
        rgb_feat_key = 'inception_resnet_v2'
    else:
        # FIX: this branch fires for an *unsupported* model, but the old
        # message read 'this feature has been extracted !' — the opposite
        # of the actual condition.
        raise NotImplementedError(
            'features for model {!r} have not been extracted'.format(feat_model))
    print("using rgb feature from {}".format(rgb_h5_path))

    if prop_file:
        # Recover each video's frame count from the proposal file.
        prop_info = load_proposal_file(prop_file)
        frame_counts = {}
        for i, vid_info in enumerate(prop_info):
            vid_name = os.path.split(vid_info[0])[1]
            frame_counts[vid_name] = int(vid_info[1])
    else:
        frame_counts = None

    self.video_list = [BinaryVideoRecord(x, frame_path, flow_h5_path, rgb_h5_path,
                                         flow_feat_key, rgb_feat_key, frame_counts,
                                         use_flow=use_flow, only_flow=only_flow,
                                         feat_stride=feat_stride,
                                         sample_duration=self.sample_duration)
                       for x in subset_videos]