def make_dataset(video_names, video_root):
    """Collect readable videos that are long enough for at least one clip."""
    dataset = []
    num_frames_collect = []
    for vid in video_names:
        video_path = os.path.join(video_root, vid)
        # check existence before trying to open the file
        if not os.path.exists(video_path):
            print('Warning: %s does not exist!' % video_path)
            continue
        try:
            cap = VideoReader(video_path)
        except Exception:
            print('Error in reading %s' % video_path)
            continue
        num_frames = cap.length
        if num_frames < cfg.DATASET.CLIP_LEN:
            print('Skipping %s due to the short length %d.' % (video_path, num_frames))
            continue
        dataset.append(video_path)
        num_frames_collect.append(num_frames)
    return dataset, num_frames_collect
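A hypothetical call, for orientation; the list file and root directory are placeholders, and VideoReader plus the global cfg come from the surrounding repo.

# Hypothetical usage (paths are placeholders, not part of the repo):
with open('train_list.txt') as f:
    video_names = [line.strip() for line in f]
dataset, num_frames_collect = make_dataset(video_names, '/data/videos')
print('%d of %d videos are usable' % (len(dataset), len(video_names)))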
def gen_mask_gt(video_list, video_root_path, annot_path, annot_out_path,
                video_suffix='avi'):
    """Rasterize per-frame event bounding boxes into binary mask images."""
    mask_annot = annot.MaskAnnotation(annot_path)
    with open(video_list, 'r') as f:
        vids = [line.strip() for line in f]
    height, width = -1, -1
    event_bbox = mask_annot.event_bbox
    # the list entries may or may not already carry a file suffix
    with_suffix = vids[0].split(".")[-1] in SUFFIX
    for vid in vids:
        vname = vid if with_suffix else vid + '.' + video_suffix
        # skip videos without annotations before opening them
        if vname not in event_bbox:
            continue
        video_path = os.path.join(video_root_path, vname)
        cap = VideoReader(video_path)
        # read a single frame to get the frame size
        for frame in cap.get_iter(1):
            img = frame.numpy()
            assert len(img.shape) > 1
            height, width, _ = img.shape
        cur_bboxes = event_bbox[vname]
        for fid in cur_bboxes:
            img = np.zeros([height, width])
            for bbox in cur_bboxes[fid]:
                print(vid, fid, bbox, height, width)
                x1, y1, w, h = bbox
                img[y1:y1 + h, x1:x1 + w] = 255
            save_path = os.path.join(annot_out_path, vid)
            if not os.path.isdir(save_path):
                os.makedirs(save_path)
            cv2.imwrite(os.path.join(save_path, "frame_%d_mask.png" % fid), img)
def get_video_reader(args, video_name):
    date, hr_slot, camera = parse_meva_clip_name(video_name)
    video_path = os.path.join(args.video_path, date, hr_slot, video_name + ".avi")
    if args.use_lijun_video_loader:
        vcap = VideoReader(video_path)
        frame_count = int(vcap.length)
    elif args.use_moviepy:
        vcap = VideoFileClip(video_path, audio=False)
        frame_count = int(vcap.fps * vcap.duration)  # approximate frame count
        vcap = vcap.iter_frames()
    else:
        # OpenCV 3/4
        vcap = cv2.VideoCapture(video_path)
        if not vcap.isOpened():
            raise Exception("cannot open %s" % video_path)
        frame_count = vcap.get(cv2.CAP_PROP_FRAME_COUNT)

    # start reading frames into queues now
    video_queuer = VideoEnqueuer(
        args, vcap, frame_count,
        frame_gap=1,  # no frame skipping
        prefetch=100,
        start=True,
        is_moviepy=args.use_moviepy,
        batch_size=1)
    get_frame_batches = video_queuer.get()
    return get_frame_batches, video_queuer.num_batches
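For orientation, a sketch of how the returned generator is consumed; the unpacking pattern is taken from load_track_and_features later in this file, where batch_size=1 makes each batch a single (image, scale, frame_idx) tuple.

# Sketch of consuming the returned generator (pattern taken from
# load_track_and_features below; tqdm is optional):
get_frame_batches, num_batches = get_video_reader(args, video_name)
for batch in tqdm(get_frame_batches, total=num_batches):
    image, scale, frame_idx = batch[0]  # batch_size=1 -> one tuple per batch
    image = image.astype("uint8")
    # ... per-frame processing goes here ...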
def load_frames_from_video(video_path, start, num, stride=1):
    """Load `num` frames sampled every `stride` frames, starting at frame `start * stride`."""
    frames = []
    cap = VideoReader(video_path)
    start_frame_id = start * stride
    video_len = cap.length
    length = num * stride
    # shift the window back (but not past frame 0) if it would overrun the video
    if length > video_len - start_frame_id:
        start_frame_id = max(0, video_len - length)
    cap.seek(start_frame_id)
    count = 0
    for frame in cap.get_iter(length):
        if count % stride:
            count += 1
            continue
        img = frame.numpy()
        assert len(img.shape) > 1
        img = img[:, :, [2, 1, 0]]  # swap channel order (BGR <-> RGB)
        h, w, c = img.shape
        # upscale so that the short side is at least 226 pixels
        if w < 226 or h < 226:
            d = 226. - min(w, h)
            sc = 1 + d / min(w, h)
            img = cv2.resize(img, dsize=(0, 0), fx=sc, fy=sc)
        img = (img / 255.) * 2 - 1  # normalize pixel values to [-1, 1]
        frames.append(img)
        count += 1
    return np.asarray(frames, dtype=np.float32), start_frame_id
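A hypothetical call, e.g. for a 3D CNN that expects a 64-frame clip normalized to [-1, 1]; the path is a placeholder.

# Hypothetical usage (the path is a placeholder):
frames, actual_start = load_frames_from_video('demo.avi', start=0, num=64, stride=2)
print(frames.shape)  # (64, H, W, 3), float32 in [-1, 1]
# a common next step is transposing to channels-first for a 3D CNN:
# clip = frames.transpose(3, 0, 1, 2)  # (3, 64, H, W)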
def test_and_save_mask(net, test_dataloader):
    clip_len = cfg.DATASET.CLIP_LEN
    clip_stride = cfg.DATASET.CLIP_STRIDE
    for sample in iter(test_dataloader):
        if cfg.TEST.WITH_MASK:
            vid, start_f, clips, vmask = sample
        else:
            vid, start_f, clips = sample
        if clips.size(2) < 8:
            continue
        # forward and get the prediction result
        vpred = net(to_cuda(clips))  # N x D x H x W
        probs = F.softmax(vpred, dim=1)
        pos_probs = probs[:, 1, :, :, :]
        start_f = start_f.numpy()
        N = len(vid)
        assert N == len(start_f)
        for i in range(N):
            cur_vid = vid[i]
            cur_video_path = os.path.join(
                cfg.DATASET.DATAROOT, 'videos',
                '%s.%s' % (cur_vid, cfg.DATASET.VIDEO_FORMAT))
            print('video: %s, start_f: %d' % (cur_video_path, start_f[i]))
            cur_video = VideoReader(cur_video_path)
            frame_count = 0
            proposals = []
            for _ in range(clip_len * clip_stride):
                if frame_count % clip_stride:
                    frame_count += 1
                    continue
                count = frame_count // clip_stride
                if not cfg.TEST.WITH_DENSE_CRF:
                    cur_pos_probs = pos_probs[i, count, :, :].cpu().numpy()
                else:
                    cur_probs = probs[i, :, count, :, :].cpu().numpy()
                    # map the normalized clip back to uint8 pixels for the CRF
                    # (TODO: is further normalization needed?)
                    resized_img = clips[i, :, count, :, :].cpu().numpy()
                    resized_img = np.uint8(255 * (resized_img + 1.) / 2.0)
                    cur_pos_probs = 1.0 * dense_crf(cur_probs, resized_img)
                smoothing(cur_pos_probs)
                labels, num_regions = regional_growing(cur_pos_probs,
                                                       pixel_val_thres=0.3)
                cur_pos_probs, bboxes = filtering(cur_pos_probs, labels,
                                                  num_regions, 5)
                if len(proposals) == 0:
                    proposals = [[(count, bbox)] for bbox in bboxes]
                else:
                    associate_bboxes(count, bboxes, proposals)
                frame_count += 1
            h, w = cur_video.height, cur_video.width
            # keep only proposals that span at least 7 frames
            new_proposals = [prop for prop in proposals if len(prop) >= 7]
            print('Number of proposals before and after filtering: %d, %d'
                  % (len(proposals), len(new_proposals)))
            if len(new_proposals) == 0:
                continue
            # map boxes from the FINESIZE network input back to video coordinates
            stride_x = 1.0 * w / cfg.DATA_TRANSFORM.FINESIZE
            stride_y = 1.0 * h / cfg.DATA_TRANSFORM.FINESIZE
            new_proposals = resize_proposals(new_proposals, stride_x, stride_y, w, h)
            save_proposals(new_proposals, 'proposals', cur_vid, start_f[i])
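resize_proposals is not defined in this file; below is a minimal sketch of the per-box coordinate scaling it presumably performs, assuming boxes are (x, y, w, h) tuples in FINESIZE coordinates.

# Minimal sketch of the coordinate mapping done by resize_proposals
# (the (x, y, w, h) box format is an assumption):
def scale_box(box, stride_x, stride_y, w, h):
    x, y, bw, bh = box
    x, bw = x * stride_x, bw * stride_x
    y, bh = y * stride_y, bh * stride_y
    # clamp to the frame boundaries
    x, y = max(0, x), max(0, y)
    bw, bh = min(bw, w - x), min(bh, h - y)
    return x, y, bw, bh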
tfconfig = tf.ConfigProto(allow_soft_placement=True)
if not args.use_all_mem:
    tfconfig.gpu_options.allow_growth = True
tfconfig.gpu_options.visible_device_list = "%s" % (
    ",".join(["%s" % i
              for i in range(args.gpuid_start, args.gpuid_start + args.gpu)]))
with tf.Session(config=tfconfig) as sess:
    if not args.is_load_from_pb:
        initialize(config=args, sess=sess)
    for videofile in tqdm(videolst, ascii=True):
        # 2. read the video file
        if args.use_lijun_video_loader:
            vcap = VideoReader(videofile)
            frame_count = int(vcap.length)
        elif args.use_moviepy:
            vcap = VideoFileClip(videofile, audio=False)
            frame_count = int(vcap.fps * vcap.duration)  # approximate frame count
            vcap = vcap.iter_frames()
        else:
            try:
                vcap = cv2.VideoCapture(videofile)
                if not vcap.isOpened():
                    raise Exception("cannot open %s" % videofile)
            except Exception as e:
                # just move on to the next video
                print("warning, cannot open %s" % videofile)
                continue
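As a quick sanity check of the device-list string built above, assuming gpuid_start=0 and args.gpu=2:

# gpuid_start=0, gpu=2 -> TensorFlow sees GPUs "0,1"
assert ",".join(["%s" % i for i in range(0, 0 + 2)]) == "0,1"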
def get_vid_meta(vid, vid_meta):
    video_path = os.path.join(cfg.DATASET.DATAROOT, 'videos',
                              '%s.%s' % (vid, cfg.DATASET.VIDEO_FORMAT))
    video = VideoReader(video_path)
    h, w = video.height, video.width
    vid_meta[vid] = (h, w)
if args.use_2level:
    targetpath = os.path.join(args.despath, videoname)
    if not os.path.exists(targetpath):
        os.makedirs(targetpath)
if args.name_level is not None:
    foldernames = video.split("/")
    prefixes = foldernames[-1 - args.name_level:-1]
    videoname = "__".join(prefixes + [videoname])

if args.use_moviepy:
    vcap = VideoFileClip(video, audio=False)
    frame_count = int(vcap.fps * vcap.duration)  # approximate frame count
    vcap_iter = vcap.iter_frames()
elif args.use_lijun:
    vcap = VideoReader(video)
    frame_count = int(vcap.length)
else:
    vcap = cv2.VideoCapture(video)
    if not vcap.isOpened():
        raise Exception("cannot open %s" % video)
    # compare the major version only; comparing the full split list
    # against "2" was always True
    if cv2.__version__.split(".")[0] != "2":
        frame_width = vcap.get(cv2.CAP_PROP_FRAME_WIDTH)
        frame_height = vcap.get(cv2.CAP_PROP_FRAME_HEIGHT)
        fps = vcap.get(cv2.CAP_PROP_FPS)
        frame_count = vcap.get(cv2.CAP_PROP_FRAME_COUNT)
def test_and_save_mask(cfg, net, vids, test_transforms):
    clip_len = cfg.DATASET.CLIP_LEN
    clip_stride = cfg.DATASET.CLIP_STRIDE
    net.eval()
    for vid in vids:
        cur_video_path = vid
        cur_vid = os.path.split(cur_video_path)[-1]
        prop_id = 0
        cur_video = VideoReader(cur_video_path)
        video_len = cur_video.length
        # start frame of the last (possibly shorter) window
        last_start_f = (video_len // (clip_len * clip_stride)) * (clip_len * clip_stride)
        if last_start_f == video_len:
            # the video length is an exact multiple of the window size
            last_start_f = -1
            last_clip_len = -1
        else:
            last_clip_len = video_len - last_start_f
        print('video_len: %d, last_start_f: %d, last_clip_len: %d'
              % (video_len, last_start_f, last_clip_len))
        f_count = 0
        clip_imgs = []
        start_f = 0
        for frame in cur_video:
            f_count += 1
            clip_imgs.append(frame.numpy())
            if len(clip_imgs) < clip_len * clip_stride:
                continue
            start_f = f_count - (clip_len * clip_stride)
            print('video: %s, start_f: %d' % (cur_video_path, start_f))
            clips = processing_frames(clip_imgs[0::clip_stride], test_transforms)
            clips = video_to_tensor(clips).unsqueeze(0)
            assert len(clips.size()) == 5, clips.size()
            assert clips.size(1) == 3, clips.size(1)
            assert clips.size(2) == clip_len, clips.size(2)
            # forward and get the prediction result
            vpred = net(to_cuda(clips))
            probs = F.softmax(vpred, dim=1)
            pos_probs = probs[:, 1, :, :, :]
            # generate and save proposals
            proposals = []
            for count in range(clip_len):
                cur_pos_probs = pos_probs[0, count, :, :].cpu().numpy()
                # to speed up, first downsample the probability map
                resized_cur_pos_probs = cv2.resize(
                    cur_pos_probs, dsize=(0, 0), fx=0.5, fy=0.5,
                    interpolation=cv2.INTER_LINEAR)
                # smoothing(resized_cur_pos_probs, len_thres=2)  # disabled here
                labels, num_regions = regional_growing(resized_cur_pos_probs,
                                                       pixel_val_thres=0.3)
                # upsample the label map back to the original size
                ori_h, ori_w = cur_pos_probs.shape
                labels = cv2.resize(labels, dsize=(ori_w, ori_h),
                                    interpolation=cv2.INTER_NEAREST)
                cur_pos_probs, bboxes = filtering(cur_pos_probs, labels,
                                                  num_regions, 5)
                if len(proposals) == 0:
                    proposals = [[(count, bbox)] for bbox in bboxes]
                else:
                    associate_bboxes(count, bboxes, proposals)
            h, w = cur_video.height, cur_video.width
            # keep only proposals that span at least 7 frames
            new_proposals = [prop for prop in proposals if len(prop) >= 7]
            print('Number of proposals before and after filtering: %d, %d'
                  % (len(proposals), len(new_proposals)))
            if len(new_proposals) == 0:
                save_path = os.path.join(cfg.SAVE_DIR, 'proposals', cur_vid)
                if not os.path.exists(save_path):
                    os.makedirs(save_path)
                # touch an empty proposal file so downstream code still finds one
                open(os.path.join(save_path, 'props.txt'), 'a').close()
            else:
                # map boxes from the FINESIZE network input back to video coordinates
                stride_x = 1.0 * w / cfg.DATA_TRANSFORM.FINESIZE
                stride_y = 1.0 * h / cfg.DATA_TRANSFORM.FINESIZE
                new_proposals = resize_proposals(new_proposals, stride_x, stride_y, w, h)
                prop_id = save_proposals(cfg, clip_imgs, new_proposals, 'proposals',
                                         cur_vid, start_f, prop_id)
            # before the final shorter window, keep the overlap it needs;
            # otherwise start the next window from an empty buffer
            if start_f == (last_start_f - clip_len * clip_stride):
                clip_imgs = clip_imgs[last_clip_len:]
            else:
                clip_imgs = []
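The last-window bookkeeping above is easiest to verify with concrete numbers; this stand-alone check assumes clip_len=8, clip_stride=2 and a 100-frame video.

# Stand-alone check of the window arithmetic (assumed numbers):
clip_len, clip_stride, video_len = 8, 2, 100
window = clip_len * clip_stride                # 16 frames per window
last_start_f = (video_len // window) * window  # 96
last_clip_len = video_len - last_start_f       # 4 leftover frames
# Full windows cover [0,16), [16,32), ..., [80,96). After the window
# starting at 80, the buffer drops its first last_clip_len frames, so the
# 4 leftover frames (96..99) complete one final full window over [84,100).
assert last_start_f - window == 80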
def load_track_and_features(args, video_name, p_file, v_file, p_extractor,
                            v_extractor, hs):
    date, hr_slot, camera = parse_meva_clip_name(video_name)
    # start loading video frames first
    video_path = os.path.join(args.video_path, date, hr_slot, video_name + ".avi")
    if args.use_lijun_video_loader:
        vcap = VideoReader(video_path)
        frame_count = int(vcap.length)
    elif args.use_moviepy:
        vcap = VideoFileClip(video_path, audio=False)
        frame_count = int(vcap.fps * vcap.duration)  # approximate frame count
        vcap = vcap.iter_frames()
    else:
        # OpenCV 3/4
        vcap = cv2.VideoCapture(video_path)
        if not vcap.isOpened():
            raise Exception("cannot open %s" % video_path)
        frame_count = vcap.get(cv2.CAP_PROP_FRAME_COUNT)

    # start reading frames into queues now
    video_queuer = VideoEnqueuer(
        args, vcap, frame_count,
        frame_gap=1,  # no frame skipping
        prefetch=100,
        start=True,
        is_moviepy=args.use_moviepy,
        batch_size=1)
    get_frame_batches = video_queuer.get()

    def load_track_file(file_path, homography):
        """Load a tracking file into a dict of numpy arrays, keyed by track_id."""
        # assuming the file is sorted by frame_idx
        data = []
        with open(file_path, "r") as f:
            for line in f:
                frame_idx, track_id, left, top, width, height, conf, _, _, _ = \
                    line.strip().split(",")
                data.append([frame_idx, track_id, left, top, width, height, conf])
        if not data:
            return {}
        data = np.array(data, dtype="float32")  # [N, 7]
        # compute top-down points from the bottom centers of the boxes
        foot_points_x = data[:, 2] + data[:, 4] / 2.  # [N]
        foot_points_y = data[:, 3] + data[:, 5]
        foot_points = np.stack([foot_points_x, foot_points_y], axis=0)  # [2, N]
        top_down_points = warp_points(foot_points, homography)  # [2, N]
        top_down_points = np.transpose(top_down_points, [1, 0])  # [N, 2]
        data = np.concatenate([data, top_down_points], axis=1)  # [N, 9]
        track_ids = np.unique(data[:, 1]).tolist()
        track_data = {}  # track_id -> [K, 9] array
        for track_id in track_ids:
            track_data[track_id] = data[data[:, 1] == track_id, :]
        return track_data

    # track_id -> data
    p_tracks = load_track_file(p_file, hs[camera])
    v_tracks = load_track_file(v_file, hs[camera])

    # 1. gather each frame's boxes to extract
    frame_data = {}  # frame_idx -> a list of boxes

    def get_track_boxes(tracks, cat_name):
        for track_id in tracks:
            idxs = list(range(0, len(tracks[track_id]), args.feature_box_gap))
            idxs = idxs[:args.feature_box_num]
            boxes = tracks[track_id][idxs, :]  # [k, 9]
            for box_idx, box in enumerate(boxes):
                frame_idx = box[0]
                tlwh = box[2:6]
                if frame_idx not in frame_data:
                    frame_data[frame_idx] = []
                frame_data[frame_idx].append((tlwh, track_id, box_idx, cat_name))

    get_track_boxes(p_tracks, "Person")
    get_track_boxes(v_tracks, "Vehicle")

    # 2. go through the video once and crop all the images to extract features
    # assuming no conflict between person/vehicle track_ids
    p_track_to_feat = {}  # track_id -> features
    v_track_to_feat = {}  # track_id -> features
    for batch in tqdm(get_frame_batches, total=video_queuer.num_batches):
        image, scale, frame_idx = batch[0]
        image = image.astype("uint8")  # the extractor needs uint8 input
        if frame_idx in frame_data:
            for tlwh, track_id, box_idx, cat_name in frame_data[frame_idx]:
                # check that the box is valid before cropping
                if valid_box(tlwh, image):
                    x, y, w, h = [int(v) for v in tlwh]
                    box_img = cv2.cvtColor(
                        image[y:y + h, x:x + w], cv2.COLOR_BGR2RGB)
                    if cat_name == "Person":
                        p_track_to_feat.setdefault(track_id, []).append(box_img)
                    elif cat_name == "Vehicle":
                        v_track_to_feat.setdefault(track_id, []).append(box_img)

    # 3. extract features for the cropped boxes
    def get_features(track_to_imgs, extractor):
        for track_id in track_to_imgs:
            box_imgs = track_to_imgs[track_id]
            track_to_imgs[track_id] = extractor(box_imgs).cpu().numpy()  # [K, 512]
            if args.use_avg_pool:
                # average over the K boxes -> [1, 512]
                track_to_imgs[track_id] = np.mean(
                    track_to_imgs[track_id], axis=0, keepdims=True)

    get_features(p_track_to_feat, p_extractor)
    get_features(v_track_to_feat, v_extractor)

    data = {}

    def gather_data(track_data, track_features, cat_name):
        data[cat_name] = {}
        for track_id in track_data:
            # ignore tracks with no valid boxes
            if track_id in track_features:
                data[cat_name][track_id] = (
                    track_data[track_id], track_features[track_id])

    gather_data(p_tracks, p_track_to_feat, "Person")
    gather_data(v_tracks, v_track_to_feat, "Vehicle")

    return data
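A hypothetical invocation; the clip name, file paths, homography dict hs, and the two feature extractors are placeholders for objects the surrounding pipeline provides.

# Hypothetical usage (all names here are placeholders supplied by the
# surrounding pipeline, not defined in this file):
data = load_track_and_features(
    args, "2018-03-07.16-50-00.16-55-00.school.G421",
    p_file="person_tracks.txt", v_file="vehicle_tracks.txt",
    p_extractor=person_reid_model, v_extractor=vehicle_reid_model,
    hs=homographies)
for track_id, (track, feat) in data["Person"].items():
    # track is [K, 9]; feat is [1, 512] with args.use_avg_pool, else [K, 512]
    print(track_id, track.shape, feat.shape)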