def __init__( self, root_path="data/BSLCP", inp_res=224, resize_res=256, setname="train", scale_factor=0.1, num_in_frames=16, evaluate_video=False, hflip=0.5, stride=0.5, gpu_collation=False, word_data_pkl=None, featurize_mask="", featurize_mode=False, ): self.root_path = root_path self.setname = setname # train, val or test self.featurize_mode = featurize_mode self.featurize_mask = featurize_mask self.gpu_collation = gpu_collation self.inp_res = inp_res self.resize_res = resize_res self.scale_factor = scale_factor self.num_in_frames = num_in_frames self.evaluate_video = evaluate_video self.hflip = hflip self.stride = stride infofile = os.path.join(root_path, "info/info.pkl") self.video_folder = "videos-resized-25fps-256x256-signdict_signbank" print(f"Loading {infofile}") data = pkl.load(open(infofile, "rb")) self.set_video_metadata(data, meta_key="videos", fixed_sz_frames=gpu_collation) self.set_class_names(data=data, word_data_pkl=word_data_pkl) self.train = list( np.where(np.asarray(data["videos"]["split"]) == 0)[0]) self.valid = list( np.where(np.asarray(data["videos"]["split"]) == 2)[0]) self.videos = [s.strip() for s in data["videos"]["name"]] if evaluate_video: self.valid, self.t_beg = self._slide_windows(self.valid) VideoDataset.__init__(self)
def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit', 'somethingv2',
        'somethingv1'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if 'somethingv1' in dataset_name:
        formatter = sthv1_image_name_formatter
    elif 'somethingv2' in dataset_name:
        formatter = sthv2_image_name_formatter
    else:
        formatter = image_name_formatter

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / '{}.hdf5'.format(video_id))

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    else:
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data
def get_validation_data(video_path,
                        annotation_path,
                        dataset_name,
                        input_type,
                        file_type,
                        spatial_transform=None,
                        temporal_transform=None,
                        target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        validation_data = VideoDataset(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)
    else:
        validation_data = VideoDatasetMultiClips(
            video_path,
            annotation_path,
            'validation',
            spatial_transform=spatial_transform,
            temporal_transform=temporal_transform,
            target_transform=target_transform,
            video_loader=loader,
            video_path_formatter=video_path_formatter)

    return validation_data, collate_fn
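# --- Usage sketch (illustrative) -----------------------------------------------
# get_validation_data returns the dataset together with a collate_fn that merges
# the multiple clips sampled per validation video, so that collate_fn must be
# handed to the DataLoader. Paths below are placeholders, transforms are omitted.
def _example_train_val_loaders():
    import torch
    from pathlib import Path

    train_data = get_training_data(Path("data/ucf101_jpg"),
                                   Path("data/ucf101.json"),
                                   'ucf101', 'rgb', 'jpg')
    val_data, val_collate = get_validation_data(Path("data/ucf101_jpg"),
                                                Path("data/ucf101.json"),
                                                'ucf101', 'rgb', 'jpg')
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=32, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=32,
                                             collate_fn=val_collate)
    return train_loader, val_loader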
def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    else:
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data
def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None):
    assert dataset_name in [
        'kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit'
    ]
    assert file_type in ['jpg', 'hdf5']

    if file_type == 'jpg':
        loader = VideoLoader(lambda x: f'image_{x:05d}.jpg')
        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
    else:
        loader = VideoLoaderHDF5()
        video_path_formatter = (lambda root_path, label, video_id:
                                root_path / label / f'{video_id}.hdf5')

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    else:
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data
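# A quick illustration of the jpg naming convention assumed by the loader above:
# the inline formatter maps frame index 1 -> 'image_00001.jpg', so extracted
# frames must follow that zero-padded pattern inside <root>/<label>/<video_id>/.
assert (lambda x: f'image_{x:05d}.jpg')(1) == 'image_00001.jpg'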
def __init__(
    self,
    info_pkl_json="misc/bsl1k/info-pkls.json",
    inp_res=224,
    resize_res=256,
    setname="train",
    scale_factor=0.1,
    num_in_frames=16,
    evaluate_video=False,
    hflip=0.5,
    stride=0.5,
    mouthing_prob_thres=0.9,
    gpu_collation=False,
    num_last_frames=20,
    featurize_mode=False,
    featurize_mask="",
    word_data_pkl=None,
    input_type="rgb",
    pose_keys=["body", "face", "lhnd", "rhnd"],
    mask_rgb=None,
    mask_type=None,
    bsl1k_pose_subset=False,
    bsl1k_anno_key="original-mouthings",
):
    self.setname = setname  # train, val or test
    self.featurize_mode = featurize_mode
    self.featurize_mask = featurize_mask
    self.gpu_collation = gpu_collation
    self.inp_res = inp_res
    self.resize_res = resize_res
    self.scale_factor = scale_factor
    self.num_in_frames = num_in_frames
    self.evaluate_video = evaluate_video
    self.hflip = hflip
    self.stride = stride
    self.input_type = input_type
    self.pose_keys = pose_keys
    self.mask_rgb = mask_rgb
    self.mask_type = mask_type
    assert self.num_in_frames == 16
    self.num_last_frames = num_last_frames
    print(f"Using only {self.num_last_frames} last frames of videos")

    # Resolve the pickle paths for the chosen annotation version
    with open(info_pkl_json, "r") as f:
        pkls = json.load(f)[bsl1k_anno_key]
    infofile = pkls["info"]
    self.video_folder = pkls["videos"]
    print(f"Loading {infofile}")
    data = pkl.load(open(infofile, "rb"))

    if self.input_type == "pose":
        pose_pkl = pkls["pose"]
        print(f"Loading {pose_pkl}")
        self.pose_data = pkl.load(open(pose_pkl, "rb"))
    if self.mask_rgb:
        assert bsl1k_pose_subset
        assert mask_type
        if self.mask_rgb == "face":
            face_pkl = pkls["face_bbox"]
            print(f"Loading {face_pkl}")
            self.face_data = pkl.load(open(face_pkl, "rb"))
    if bsl1k_pose_subset:  # self.mask_rgb:
        mouth_pkl = pkls["mouth_bbox"]
        print(f"Loading {mouth_pkl}")
        self.mouth_data = pkl.load(open(mouth_pkl, "rb"))

    self.set_video_metadata(data, meta_key="videos", fixed_sz_frames=gpu_collation)
    subset_ix = self.set_class_names(data=data, word_data_pkl=word_data_pkl)

    self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0])  # train
    self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0])  # test
    self.videos = [s.strip() for s in data["videos"]["name"]]

    # Take subsets based on 'mouthing_prob'
    confident_mouthing = np.where(
        np.asarray(data["videos"]["mouthing_prob"]) >= mouthing_prob_thres
    )[0]
    msg = (
        f"Keeping {len(confident_mouthing)}/{len(data['videos']['mouthing_prob'])} "
        f"videos with more than {mouthing_prob_thres} mouthing confidence."
    )
    print(msg)
    self.train = [i for i in self.train if i in confident_mouthing]
    self.valid = [i for i in self.valid if i in confident_mouthing]

    print("Taking subsets according to word vocab")
    self.train = list(set(self.train).intersection(set(subset_ix)))
    self.valid = list(set(self.valid).intersection(set(subset_ix)))

    if self.input_type == "pose":
        valid_pose_ix = np.where(
            np.array([i is not None for i in self.pose_data["pose"]]))[0]
        print(f"{len(self.train)} train, {len(self.valid)} val samples.")
        print("Taking subsets according to having pose or not")
        self.train = list(set(self.train).intersection(set(valid_pose_ix)))
        self.valid = list(set(self.valid).intersection(set(valid_pose_ix)))
        print(f"{len(self.train)} train, {len(self.valid)} val samples.")

    if bsl1k_pose_subset:  # self.mask_rgb:
        # Valid mouth ix should be equivalent to valid face ix, so leaving this bit.
        valid_mouth_ix = np.where(
            np.array([i is not None for i in self.mouth_data]))[0]
        print(f"{len(self.train)} train, {len(self.valid)} val samples.")
        print("Taking subsets according to having pose or not")
        self.train = list(set(self.train).intersection(set(valid_mouth_ix)))
        self.valid = list(set(self.valid).intersection(set(valid_mouth_ix)))
        print(f"{len(self.train)} train, {len(self.valid)} val samples.")

    # Take a subset for validation if too large
    if self.setname == "val" and len(self.valid) > 1300:
        self.valid = self.valid[::int(len(self.valid) / 1300)]

    if evaluate_video:
        self.valid, self.t_beg = self._slide_windows(self.valid)
    VideoDataset.__init__(self)
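# --- Worked example of the mouthing-confidence filter above (standalone) -------
# With mouthing_prob_thres=0.9, only episodes whose automatic mouthing score
# reaches the threshold survive; the train/val index lists are then restricted
# to that confident subset. The numbers below are made up for illustration.
def _example_mouthing_filter():
    import numpy as np

    mouthing_prob = np.array([0.95, 0.42, 0.91, 0.88, 0.99])
    train_ix, valid_ix = [0, 1, 2], [3, 4]
    confident = np.where(mouthing_prob >= 0.9)[0]        # -> [0, 2, 4]
    train_ix = [i for i in train_ix if i in confident]   # -> [0, 2]
    valid_ix = [i for i in valid_ix if i in confident]   # -> [4]
    return train_ix, valid_ix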
def __init__( self, root_path="data/PHOENIX-2014-T-release-v3/PHOENIX-2014-T", inp_res=224, resize_res=256, setname="train", scale_factor=0.1, num_in_frames=16, evaluate_video=False, hflip=0.5, stride=0.5, gpu_collation=False, assign_labels="auto", ): self.root_path = root_path self.setname = setname # train, val or test self.gpu_collation = gpu_collation self.inp_res = inp_res self.resize_res = resize_res self.scale_factor = scale_factor self.num_in_frames = num_in_frames self.evaluate_video = evaluate_video self.hflip = hflip self.stride = stride self.assign_labels = assign_labels infofile = os.path.join(root_path, "info", "info.pkl") print(f"Loading {infofile}") data = pkl.load(open(infofile, "rb")) self.videos = [s.strip() for s in data["videos"]["name"]] other_class_ix = 1232 self.classes = data["videos"]["gloss_ids"] replace_cnt = 0 for i, seq in enumerate(self.classes): for j, gid in enumerate(seq): if gid == -1: replace_cnt += 1 self.classes[i][j] = other_class_ix print(f"Replaced {replace_cnt} -1s with {other_class_ix}") with open(os.path.join(self.root_path, "info", "words.txt"), "r") as f: self.class_names = f.read().splitlines() self.class_names.append("1232 __OTHER__") self.video_folder = "videos" meta_key = self.video_folder if gpu_collation: # GPU collation requires all inputs to share the same spatial input size self.video_folder = "videos-resized-256fps-256x256" self.set_video_metadata(data, meta_key=meta_key, fixed_sz_frames=gpu_collation) self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0]) if self.setname == "val": self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 1)[0]) elif self.setname == "test": self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0]) if self.assign_labels == "auto": self.frame_level_glosses = data["videos"]["alignments"]["gloss_id"] if evaluate_video: self.valid, self.t_beg = self._slide_windows(self.valid) VideoDataset.__init__(self)
def get_training_data(video_path,
                      annotation_path,
                      dataset_name,
                      input_type,
                      file_type,
                      spatial_transform=None,
                      temporal_transform=None,
                      target_transform=None,
                      sample_t_stride=1):
    assert dataset_name in [
        'kinetics', 'mini_kinetics', 'activitynet', 'ucf101', 'hmdb51', 'mit',
        'breakfast', 'mini_breakfast', 'movingmnist',
        'movingmnist_blackframes', 'movingmnist_longterm',
        'movingmnist_motiondiff', 'movingmnist_motionsame',
        'movingmnist_frequencies', 'movingmnist_frequencies_complex',
        'something', 'movingmnist_static'
    ]
    assert input_type in ['rgb', 'flow']
    assert file_type in ['jpg', 'hdf5', None]

    if file_type == 'jpg':
        assert input_type == 'rgb', 'flow input is supported only when input type is hdf5.'

        if 'movingmnist' in dataset_name:
            image_name_formatter = mnist_image_name_formatter
        elif 'something' in dataset_name:
            image_name_formatter = something_image_name_formatter
        else:
            image_name_formatter = usual_image_name_formatter

        if get_image_backend() == 'accimage':
            from datasets.loader import ImageLoaderAccImage
            loader = VideoLoader(image_name_formatter, ImageLoaderAccImage())
        else:
            loader = VideoLoader(image_name_formatter)

        video_path_formatter = (
            lambda root_path, label, video_id: root_path / label / video_id)
        if 'movingmnist' in dataset_name or 'something' in dataset_name:
            video_path_formatter = (
                lambda root_path, label, video_id: root_path / video_id)
    else:
        if input_type == 'rgb':
            loader = VideoLoaderHDF5()
        else:
            loader = VideoLoaderFlowHDF5()
        if dataset_name in ['kinetics', 'mini_kinetics']:
            video_path_formatter = (lambda root_path, label, video_id:
                                    root_path / label / f'{video_id}')
        else:
            video_path_formatter = (lambda root_path, label, video_id:
                                    root_path / label / f'{video_id}.hdf5')

    print("video_path_formatter", video_path_formatter)

    if dataset_name == 'activitynet':
        training_data = ActivityNet(video_path,
                                    annotation_path,
                                    'training',
                                    spatial_transform=spatial_transform,
                                    temporal_transform=temporal_transform,
                                    target_transform=target_transform,
                                    video_loader=loader,
                                    video_path_formatter=video_path_formatter)
    elif dataset_name in ['kinetics', 'mini_kinetics']:
        training_data = VideoDataset(Path(os.path.join(video_path, "h5_train_frames")),
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)
    else:
        print("Building VideoDataset for", dataset_name)
        # print(spatial_transform)
        # print(temporal_transform)
        # print(loader)
        training_data = VideoDataset(video_path,
                                     annotation_path,
                                     'training',
                                     spatial_transform=spatial_transform,
                                     temporal_transform=temporal_transform,
                                     target_transform=target_transform,
                                     video_loader=loader,
                                     video_path_formatter=video_path_formatter)

    return training_data
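# A short check of how the two hdf5 path formatters above resolve (standalone;
# the directory names are placeholders): kinetics-style datasets point at a
# per-video directory, everything else at a single <video_id>.hdf5 file.
def _example_path_formatters():
    from pathlib import Path

    kinetics_fmt = lambda root, label, vid: root / label / f'{vid}'
    default_fmt = lambda root, label, vid: root / label / f'{vid}.hdf5'
    assert kinetics_fmt(Path('data'), 'juggling', 'abc123') == Path('data/juggling/abc123')
    assert default_fmt(Path('data'), 'juggling', 'abc123') == Path('data/juggling/abc123.hdf5')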
parser.add_argument('-evd', '--extra_validation_dataset', default=None,
                    help='extra validation dataset')
parser.add_argument('-tdl', '--training_dataset_length',
                    default=default_training_dataset_length,
                    help='training dataset length', type=int)
parser.add_argument('-vdl', '--validation_dataset_length',
                    default=default_validation_dataset_length,
                    help='validation dataset length', type=int)
parser.add_argument('-b', '--batch', help='batch size',
                    default=default_batch_size, type=int)
parser.add_argument('-lr', '--learning_rate', help='learning rate',
                    default=default_learning_rate, type=float)
parser.add_argument('-ep', '--epochs', help='number of epochs',
                    default=default_number_epochs, type=int)
parser.add_argument('-sr', '--safe_reset',
                    help="if output file exists, use that as input file to prevent lost work",
                    action="store_true")
parser.add_argument('-su', '--supervised', help="train supervised",
                    action="store_true")
parser.add_argument('-n', '--net', default="original", help="choose net name")
parser.add_argument('-m', '--map', default="homographymap", help="choose map name")
parser.add_argument('-t', '--type', default="training",
                    help="type of dataset: training or video")

args = parser.parse_args()

if args.type == "video":
    print("Video dataset")
    dataset = VideoDataset("datasets/%s" % args.training_dataset,
                           args.training_dataset_length)
    val_dataset = VideoDataset("datasets/%s" % args.validation_dataset,
                               args.validation_dataset_length)
else:
    print("Training dataset")
    dataset = TrainingDataset("datasets/%s" % args.training_dataset,
                              args.training_dataset_length)
    val_dataset = TrainingDataset("datasets/%s" % args.validation_dataset,
                                  args.validation_dataset_length)

if args.extra_validation_dataset is not None:
    extra_val_dataset = VideoDataset("datasets/%s" % args.extra_validation_dataset)
else:
    extra_val_dataset = None

input_checkpoint = None if args.input is None else "checkpoints/%s" % args.input
output_checkpoint = None if args.output is None else "checkpoints/%s" % args.output
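# Example invocation (illustrative; the --training_dataset, --validation_dataset,
# --input and --output arguments referenced above are assumed to be added to the
# parser earlier in this script, and train.py is a placeholder script name):
#
#   python train.py --training_dataset train_pairs --validation_dataset val_pairs \
#       -t video -b 16 -lr 1e-4 -ep 50 -n original -m homographymap -sr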
def __init__( self, root_path="data/wlasl", inp_res=224, resize_res=256, setname="train", scale_factor=0.1, num_in_frames=64, evaluate_video=False, hflip=0.5, stride=0.5, ram_data=True, gpu_collation=False, use_bbox=True, monolithic_pkl_path="data/pickled-videos/wlasl-compressed-quality-90-resized-256x256.pkl", input_type="rgb", pose_keys=["body", "face", "lhnd", "rhnd"], mask_rgb=None, mask_type=None, mask_prob=1.0, ): self.root_path = root_path self.setname = setname # train, val or test self.inp_res = inp_res self.resize_res = resize_res self.scale_factor = scale_factor self.num_in_frames = num_in_frames self.evaluate_video = evaluate_video self.hflip = hflip self.gpu_collation = gpu_collation self.stride = stride self.use_bbox = use_bbox self.input_type = input_type self.pose_keys = pose_keys self.mask_rgb = mask_rgb self.mask_type = mask_type self.video_folder = "videos_360h_25fps" if Path(monolithic_pkl_path).exists() and ram_data: print(f"Loading monolithic pickle from {monolithic_pkl_path}") self.video_data_dict = memcache(monolithic_pkl_path) else: self.video_data_dict = None infofile = os.path.join(root_path, "info", "info.pkl") print(f"Loading {infofile}") data = pkl.load(open(infofile, "rb")) if self.input_type == "pose": pose_pkl = os.path.join(root_path, "info", "pose.pkl") print(f"Loading {pose_pkl}") self.pose_data = pkl.load(open(pose_pkl, "rb")) if self.mask_rgb: assert mask_type if self.mask_rgb == "face": face_pkl = os.path.join(root_path, "info", "face_bbox.pkl") print(f"Loading {face_pkl}") self.face_data = pkl.load(open(face_pkl, "rb")) # Use this to take subset if self.input_type == "pose" or self.mask_rgb: mouth_pkl = os.path.join(root_path, "info", "mouth_bbox.pkl") print(f"Loading {mouth_pkl}") self.mouth_data = pkl.load(open(mouth_pkl, "rb")) self.videos = [s.strip() for s in data["videos"]["name"]] self.videos = np.asarray(self.videos) self.classes = data["videos"]["word_id"] with open(os.path.join(self.root_path, "info", "words.txt"), "r") as f: self.class_names = f.read().splitlines() meta_key = self.video_folder if gpu_collation and not self.video_data_dict: # GPU collation requires all inputs to share the same spatial input size self.video_folder = "videos-resized-256fps-256x256" self.set_video_metadata(data, meta_key=meta_key, fixed_sz_frames=gpu_collation) bboxes_orig = [s for s in np.asarray(data["videos"]["box"])] self.bboxes = [] for i, bb in enumerate(bboxes_orig): ht = data["videos"]["videos_original"]["H"][i] wt = data["videos"]["videos_original"]["W"][i] xmin, ymin, xmax, ymax = bb bb_norm = [ymin / ht, xmin / wt, ymax / ht, xmax / wt] self.bboxes.append(bb_norm) self.train = list(np.where(np.asarray(data["videos"]["split"]) == 0)[0]) if self.setname == "val": self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 1)[0]) elif self.setname == "test": self.valid = list(np.where(np.asarray(data["videos"]["split"]) == 2)[0]) if self.input_type == "pose" or self.mask_rgb: # Valid mouth ix should be equivalent to valid face ix, valid pose ix etc valid_mouth_ix = np.where( np.array([i is not None for i in self.mouth_data]) )[0] if self.setname == "val" or self.setname == "test": print(f"{len(self.train)} train, {len(self.valid)} val samples.") print("Taking subsets according to having pose or not") self.train = list(set(self.train).intersection(set(valid_mouth_ix))) if self.setname == "val" or self.setname == "test": self.valid = list(set(self.valid).intersection(set(valid_mouth_ix))) print(f"{len(self.train)} train, {len(self.valid)} val 
samples.") if evaluate_video: self.valid, self.t_beg = self._slide_windows(self.valid) VideoDataset.__init__(self)