def __init__( self, root_path="data/BSLCP", inp_res=224, resize_res=256, setname="train", scale_factor=0.1, num_in_frames=16, evaluate_video=False, hflip=0.5, stride=0.5, gpu_collation=False, word_data_pkl=None, featurize_mask="", featurize_mode=False, ): self.root_path = root_path self.setname = setname # train, val or test self.featurize_mode = featurize_mode self.featurize_mask = featurize_mask self.gpu_collation = gpu_collation self.inp_res = inp_res self.resize_res = resize_res self.scale_factor = scale_factor self.num_in_frames = num_in_frames self.evaluate_video = evaluate_video self.hflip = hflip self.stride = stride infofile = os.path.join(root_path, "info/info.pkl") self.video_folder = "videos-resized-25fps-256x256-signdict_signbank" print(f"Loading {infofile}") data = pkl.load(open(infofile, "rb")) self.set_video_metadata(data, meta_key="videos", fixed_sz_frames=gpu_collation) self.set_class_names(data=data, word_data_pkl=word_data_pkl) self.train = list( np.where(np.asarray(data["videos"]["split"]) == 0)[0]) self.valid = list( np.where(np.asarray(data["videos"]["split"]) == 2)[0]) self.videos = [s.strip() for s in data["videos"]["name"]] if evaluate_video: self.valid, self.t_beg = self._slide_windows(self.valid) VideoDataset.__init__(self)


# BSL1K dataset.
def __init__(
    self,
    info_pkl_json="misc/bsl1k/info-pkls.json",
    inp_res=224,
    resize_res=256,
    setname="train",
    scale_factor=0.1,
    num_in_frames=16,
    evaluate_video=False,
    hflip=0.5,
    stride=0.5,
    mouthing_prob_thres=0.9,
    gpu_collation=False,
    num_last_frames=20,
    featurize_mode=False,
    featurize_mask="",
    word_data_pkl=None,
    input_type="rgb",
    pose_keys=["body", "face", "lhnd", "rhnd"],
    mask_rgb=None,
    mask_type=None,
    bsl1k_pose_subset=False,
    bsl1k_anno_key="original-mouthings",
):
    self.setname = setname  # train, val or test
    self.featurize_mode = featurize_mode
    self.featurize_mask = featurize_mask
    self.gpu_collation = gpu_collation
    self.inp_res = inp_res
    self.resize_res = resize_res
    self.scale_factor = scale_factor
    self.num_in_frames = num_in_frames
    self.evaluate_video = evaluate_video
    self.hflip = hflip
    self.stride = stride
    self.input_type = input_type
    self.pose_keys = pose_keys
    self.mask_rgb = mask_rgb
    self.mask_type = mask_type
    assert self.num_in_frames == 16
    self.num_last_frames = num_last_frames
    print(f"Using only the last {self.num_last_frames} frames of videos")
    with open(info_pkl_json, "r") as f:
        pkls = json.load(f)[bsl1k_anno_key]
    infofile = pkls["info"]
    self.video_folder = pkls["videos"]
    print(f"Loading {infofile}")
    data = pkl.load(open(infofile, "rb"))
    if self.input_type == "pose":
        pose_pkl = pkls["pose"]
        print(f"Loading {pose_pkl}")
        self.pose_data = pkl.load(open(pose_pkl, "rb"))
    if self.mask_rgb:
        assert bsl1k_pose_subset
        assert mask_type
    if self.mask_rgb == "face":
        face_pkl = pkls["face_bbox"]
        print(f"Loading {face_pkl}")
        self.face_data = pkl.load(open(face_pkl, "rb"))
    if bsl1k_pose_subset:  # self.mask_rgb:
        mouth_pkl = pkls["mouth_bbox"]
        print(f"Loading {mouth_pkl}")
        self.mouth_data = pkl.load(open(mouth_pkl, "rb"))
    self.set_video_metadata(data, meta_key="videos", fixed_sz_frames=gpu_collation)
    subset_ix = self.set_class_names(data=data, word_data_pkl=word_data_pkl)
    self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0])  # train
    self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0])  # test
    self.videos = [s.strip() for s in data["videos"]["name"]]
    # Keep only samples whose 'mouthing_prob' meets the confidence threshold.
    confident_mouthing = np.where(
        np.asarray(data["videos"]["mouthing_prob"]) >= mouthing_prob_thres
    )[0]
    msg = (
        f"Keeping {len(confident_mouthing)}/{len(data['videos']['mouthing_prob'])} "
        f"videos with at least {mouthing_prob_thres} mouthing confidence."
    )
    print(msg)
    self.train = [i for i in self.train if i in confident_mouthing]
    self.valid = [i for i in self.valid if i in confident_mouthing]
    print("Taking subsets according to word vocab")
    self.train = list(set(self.train).intersection(set(subset_ix)))
    self.valid = list(set(self.valid).intersection(set(subset_ix)))
    if self.input_type == "pose":
        valid_pose_ix = np.where(
            np.array([i is not None for i in self.pose_data["pose"]])
        )[0]
        print(f"{len(self.train)} train, {len(self.valid)} val samples.")
        print("Taking subsets according to having pose or not")
        self.train = list(set(self.train).intersection(set(valid_pose_ix)))
        self.valid = list(set(self.valid).intersection(set(valid_pose_ix)))
        print(f"{len(self.train)} train, {len(self.valid)} val samples.")
    if bsl1k_pose_subset:  # self.mask_rgb:
        # Valid mouth ix should be equivalent to valid face ix, so this is
        # left as is.
        valid_mouth_ix = np.where(
            np.array([i is not None for i in self.mouth_data])
        )[0]
        print(f"{len(self.train)} train, {len(self.valid)} val samples.")
        print("Taking subsets according to having a mouth bbox or not")
        self.train = list(set(self.train).intersection(set(valid_mouth_ix)))
        self.valid = list(set(self.valid).intersection(set(valid_mouth_ix)))
        print(f"{len(self.train)} train, {len(self.valid)} val samples.")
    # Take a subset for validation if it is too large.
    if self.setname == "val" and len(self.valid) > 1300:
        self.valid = self.valid[:: int(len(self.valid) / 1300)]
    if evaluate_video:
        self.valid, self.t_beg = self._slide_windows(self.valid)
    VideoDataset.__init__(self)
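

# Example (hedged): the mouthing-confidence filter in isolation. The toy
# probabilities and the helper name are illustrative; real values come from
# info.pkl. Wrapping the confident indices in a set makes each membership
# test O(1), which matters at BSL1K scale.
def _example_mouthing_filter(mouthing_prob_thres=0.9):
    import numpy as np

    mouthing_prob = np.asarray([0.95, 0.40, 0.91, 0.88])
    confident = set(np.where(mouthing_prob >= mouthing_prob_thres)[0])  # {0, 2}
    train = [0, 1, 2]
    return [i for i in train if i in confident]  # -> [0, 2]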
def __init__( self, root_path="data/PHOENIX-2014-T-release-v3/PHOENIX-2014-T", inp_res=224, resize_res=256, setname="train", scale_factor=0.1, num_in_frames=16, evaluate_video=False, hflip=0.5, stride=0.5, gpu_collation=False, assign_labels="auto", ): self.root_path = root_path self.setname = setname # train, val or test self.gpu_collation = gpu_collation self.inp_res = inp_res self.resize_res = resize_res self.scale_factor = scale_factor self.num_in_frames = num_in_frames self.evaluate_video = evaluate_video self.hflip = hflip self.stride = stride self.assign_labels = assign_labels infofile = os.path.join(root_path, "info", "info.pkl") print(f"Loading {infofile}") data = pkl.load(open(infofile, "rb")) self.videos = [s.strip() for s in data["videos"]["name"]] other_class_ix = 1232 self.classes = data["videos"]["gloss_ids"] replace_cnt = 0 for i, seq in enumerate(self.classes): for j, gid in enumerate(seq): if gid == -1: replace_cnt += 1 self.classes[i][j] = other_class_ix print(f"Replaced {replace_cnt} -1s with {other_class_ix}") with open(os.path.join(self.root_path, "info", "words.txt"), "r") as f: self.class_names = f.read().splitlines() self.class_names.append("1232 __OTHER__") self.video_folder = "videos" meta_key = self.video_folder if gpu_collation: # GPU collation requires all inputs to share the same spatial input size self.video_folder = "videos-resized-256fps-256x256" self.set_video_metadata(data, meta_key=meta_key, fixed_sz_frames=gpu_collation) self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0]) if self.setname == "val": self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 1)[0]) elif self.setname == "test": self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0]) if self.assign_labels == "auto": self.frame_level_glosses = data["videos"]["alignments"]["gloss_id"] if evaluate_video: self.valid, self.t_beg = self._slide_windows(self.valid) VideoDataset.__init__(self)
def __init__( self, root_path="data/wlasl", inp_res=224, resize_res=256, setname="train", scale_factor=0.1, num_in_frames=64, evaluate_video=False, hflip=0.5, stride=0.5, ram_data=True, gpu_collation=False, use_bbox=True, monolithic_pkl_path="data/pickled-videos/wlasl-compressed-quality-90-resized-256x256.pkl", input_type="rgb", pose_keys=["body", "face", "lhnd", "rhnd"], mask_rgb=None, mask_type=None, mask_prob=1.0, ): self.root_path = root_path self.setname = setname # train, val or test self.inp_res = inp_res self.resize_res = resize_res self.scale_factor = scale_factor self.num_in_frames = num_in_frames self.evaluate_video = evaluate_video self.hflip = hflip self.gpu_collation = gpu_collation self.stride = stride self.use_bbox = use_bbox self.input_type = input_type self.pose_keys = pose_keys self.mask_rgb = mask_rgb self.mask_type = mask_type self.video_folder = "videos_360h_25fps" if Path(monolithic_pkl_path).exists() and ram_data: print(f"Loading monolithic pickle from {monolithic_pkl_path}") self.video_data_dict = memcache(monolithic_pkl_path) else: self.video_data_dict = None infofile = os.path.join(root_path, "info", "info.pkl") print(f"Loading {infofile}") data = pkl.load(open(infofile, "rb")) if self.input_type == "pose": pose_pkl = os.path.join(root_path, "info", "pose.pkl") print(f"Loading {pose_pkl}") self.pose_data = pkl.load(open(pose_pkl, "rb")) if self.mask_rgb: assert mask_type if self.mask_rgb == "face": face_pkl = os.path.join(root_path, "info", "face_bbox.pkl") print(f"Loading {face_pkl}") self.face_data = pkl.load(open(face_pkl, "rb")) # Use this to take subset if self.input_type == "pose" or self.mask_rgb: mouth_pkl = os.path.join(root_path, "info", "mouth_bbox.pkl") print(f"Loading {mouth_pkl}") self.mouth_data = pkl.load(open(mouth_pkl, "rb")) self.videos = [s.strip() for s in data["videos"]["name"]] self.videos = np.asarray(self.videos) self.classes = data["videos"]["word_id"] with open(os.path.join(self.root_path, "info", "words.txt"), "r") as f: self.class_names = f.read().splitlines() meta_key = self.video_folder if gpu_collation and not self.video_data_dict: # GPU collation requires all inputs to share the same spatial input size self.video_folder = "videos-resized-256fps-256x256" self.set_video_metadata(data, meta_key=meta_key, fixed_sz_frames=gpu_collation) bboxes_orig = [s for s in np.asarray(data["videos"]["box"])] self.bboxes = [] for i, bb in enumerate(bboxes_orig): ht = data["videos"]["videos_original"]["H"][i] wt = data["videos"]["videos_original"]["W"][i] xmin, ymin, xmax, ymax = bb bb_norm = [ymin / ht, xmin / wt, ymax / ht, xmax / wt] self.bboxes.append(bb_norm) self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0]) if self.setname == "val": self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 1)[0]) elif self.setname == "test": self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0]) if self.input_type == "pose" or self.mask_rgb: # Valid mouth ix should be equivalent to valid face ix, valid pose ix etc valid_mouth_ix = np.where( np.array([i is not None for i in self.mouth_data]) )[0] if self.setname == "val" or self.setname == "test": print(f"{len(self.train)} train, {len(self.valid)} val samples.") print("Taking subsets according to having pose or not") self.train = list(set(self.train).intersection(set(valid_mouth_ix))) if self.setname == "val" or self.setname == "test": self.valid = list(set(self.valid).intersection(set(valid_mouth_ix))) print(f"{len(self.train)} train, {len(self.valid)} val 
samples.") if evaluate_video: self.valid, self.t_beg = self._slide_windows(self.valid) VideoDataset.__init__(self)