def retrieve_action_pair_clips_dict(mode='train'):
    '''
    Similar to retrieve_action_pair_clips(), but does not condition on actions, and
    returns every available pair in the dataset organized by dictionary keys instead.
    '''
    # Format with participant_id, video_id, frame_number
    video_path = '/local/vondrick/epic-kitchens/raw/rgb/{}/{}/frame_{:010d}.jpg'
    gulp_subfolder = ('rgb_train' if mode == 'train' or mode == 'val'
                      else 'rgb_' + mode)
    gulp_path = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp/' + gulp_subfolder
    epic_inst = list(EpicVideoDataset(gulp_path, 'verb+noun'))
    epic_inst.sort(
        key=(lambda k: k.video_id + '_{:010d}'.format(k.start_frame)))

    # Maps (verb_A, noun_A, verb_B, noun_B) to a list of clip metadata
    result = defaultdict(list)

    # Loop over all pairs of consecutive action segments
    for i in range(len(epic_inst) - 1):
        segment_A = epic_inst[i]
        segment_B = epic_inst[i + 1]

        # Proceed only if both segments belong to the same video
        if segment_A.video_id != segment_B.video_id:
            continue

        # Set action pair as key
        cur_key = (segment_A.verb_class, segment_A.noun_class,
                   segment_B.verb_class, segment_B.noun_class)

        # Append clip: (video_id, start_frame, frames_A, frames_gap, frames_B)
        cur_item = (segment_A.video_id, segment_A.start_frame,
                    segment_A.num_frames,
                    segment_B.start_frame -
                    (segment_A.start_frame + segment_A.num_frames),
                    segment_B.num_frames)
        result[cur_key].append(cur_item)

    return result
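# Usage sketch (hedged): one way the dictionary returned above could be inspected,
# e.g. to list the most frequent consecutive action pairs. Keys are tuples of
# verb/noun class indices; this helper is illustrative and not part of the
# EpicVideoDataset API, and it assumes the hard-coded gulp paths above exist.
def example_most_common_action_pairs(top_k=5):
    pair_clips = retrieve_action_pair_clips_dict(mode='train')
    # Sort action pairs by how many clips depict them, most frequent first
    most_common = sorted(pair_clips.items(), key=lambda kv: len(kv[1]), reverse=True)
    for (verb_A, noun_A, verb_B, noun_B), clips in most_common[:top_k]:
        print('pair ({}, {}) -> ({}, {}): {} clips'.format(
            verb_A, noun_A, verb_B, noun_B, len(clips)))
    return most_common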
def __init__(self, mode='train', transform=None, seq_len=5, num_seq=8,
             downsample=6, class_type='verb+noun', drive='ssd'):
    print('-- WARNING! -- using obsolete dataset class, '
          'see utils/dataset_epic.py and dataset_other.py instead')
    self.mode = mode
    self.transform = transform
    self.seq_len = seq_len
    self.num_seq = num_seq
    self.downsample = downsample
    self.class_type = class_type

    if drive == 'ssd':
        gulp_root = '/local/vondrick/epic-kitchens/gulp'
    else:
        gulp_root = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp'

    print(os.path.join(gulp_root, 'rgb_train', self.class_type))
    self.EpicDataset = EpicVideoDataset(
        os.path.join(gulp_root, 'rgb_train'), self.class_type)

    dataset = list(self.EpicDataset)
    rgb = []
    for i in range(len(dataset)):
        # Remove segments that are too short to sample a full sequence from
        if dataset[i].num_frames > self.seq_len * self.num_seq * self.downsample:
            rgb.append(dataset[i])
    del dataset

    # Random 80/20 train/val split
    # NOTE: the sampling is unseeded, so the split differs between runs
    train_idx = random.sample(range(1, len(rgb)), int(float(len(rgb)) * 0.8))
    rgb_train = []
    rgb_val = []
    for i in range(len(rgb)):
        if i in train_idx:
            rgb_train.append(rgb[i])
        else:
            rgb_val.append(rgb[i])

    if self.mode == 'train':
        self.video_info = rgb_train
    elif self.mode in ['val']:
        self.video_info = rgb_val
def gen_label(gulp_dir, interim_dir, out, split_path):
    # Load the train/val split (lists of segment indices) and build an
    # index -> 'train'/'val' lookup
    with open(split_path, 'r') as f:
        trainval = json.load(f)
    idxsplit = (len(trainval['train']) + len(trainval['val'])) * [None]
    for i in trainval['train']:
        idxsplit[i] = 'train'
    for i in trainval['val']:
        idxsplit[i] = 'val'
    assert None not in idxsplit

    action_classes = {}
    class_counts = {}
    next_action_class = 0
    rgbviddata = EpicVideoDataset(f'{gulp_dir}/rgb_train', 'verb+noun')
    outputs = {'train': [], 'val': []}
    categories = []

    for i, seg in enumerate(rgbviddata.video_segments):
        parid = seg['participant_id']
        vidid = seg['video_id']
        nar = seg['narration'].replace(' ', '-')
        uid = seg['uid']
        reldir = f'{parid}/{vidid}/{vidid}_{uid}_{nar}'
        assert os.path.exists(f'{interim_dir}/{reldir}'), f'{interim_dir}/{reldir}'

        # Assign a contiguous action class index to every (verb, noun) pair
        verb = seg['verb_class']
        noun = seg['noun_class']
        action = f'{verb},{noun}'
        if action in action_classes:
            classidx = action_classes[action]
            class_counts[action] += 1
        else:
            categories.append(f'{seg["verb"]} {seg["noun"]}')
            classidx = next_action_class
            action_classes[action] = classidx
            class_counts[action] = 1
            next_action_class += 1

        nframes = seg['num_frames']
        outputs[idxsplit[i]].append(f'{reldir} {nframes} {classidx}')

    # Write category names and per-split video folder lists
    assert len(set(categories)) == len(categories)
    with open(f'{out}/category.txt', 'w') as f:
        f.write('\n'.join(categories))
    with open(f'{out}/train_videofolder.txt', 'w') as f:
        f.write('\n'.join(outputs['train']))
    with open(f'{out}/val_videofolder.txt', 'w') as f:
        f.write('\n'.join(outputs['val']))

    # Plot a sorted histogram of segment counts per action class
    class_counts = list(class_counts.values())
    class_counts.sort()
    plt.bar(range(0, len(class_counts)), class_counts)
    plt.savefig('action_class_histogram.png')
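# Usage sketch (hedged): gen_label() above expects a gulp directory, the interim
# frame directory it asserts against, an output directory, and a JSON split file
# with 'train' and 'val' index lists. All paths below are placeholders, not paths
# taken from the original project.
def example_gen_label():
    gen_label(gulp_dir='data/processed/gulp',
              interim_dir='data/interim',
              out='data/labels',
              split_path='data/splits/trainval_split.json')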
def __init__(self, mode='train', transform=None, seq_len=6, num_seq=5,
             downsample=3, class_type='verb+noun'):
    self.mode = mode
    self.transform = transform
    self.seq_len = seq_len
    self.num_seq = num_seq
    self.downsample = downsample
    self.class_type = class_type

    gulp_root = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp'
    print(os.path.join(gulp_root, 'rgb_train', self.class_type))
    self.EpicDataset = EpicVideoDataset(
        os.path.join(gulp_root, 'rgb_train'), self.class_type)

    dataset = list(self.EpicDataset)
    rgb = []
    for i in range(len(dataset)):
        # Remove segments that are too short
        if dataset[i].num_frames > self.seq_len * self.num_seq * self.downsample:
            rgb.append(dataset[i])
    del dataset

    train_idx = random.sample(range(1, len(rgb)), int(float(len(rgb)) * 0.8))
    rgb_train = []
    rgb_val = []
    for i in range(len(rgb)):
        if i in train_idx:
            rgb_train.append(rgb[i])
        else:
            rgb_val.append(rgb[i])

    if self.mode == 'train':
        self.video_info = rgb_train
    elif self.mode in ['val']:
        self.video_info = rgb_val
def retrieve_action_pair_clips(verb_A, noun_A, verb_B, noun_B, mode='train'):
    '''
    Returns a list of all identifying metadata for Epic video clip sequences that
    depict an ordered sequence of actions "verb_A noun_A" -> "verb_B noun_B".
    Arguments can be either class indices or strings.
    Every item is: (video_id, start_frame, frames_A, frames_gap, frames_B),
    for example: ('P12_08', 1234, 200, 100, 200), where the total clip length is 500 frames.
    mode: train / val / test.
    WARNING: the same gulp folder is used for train & val, i.e. the split is NOT made here!
    '''
    # Format with participant_id, video_id, frame_number
    video_path = '/local/vondrick/epic-kitchens/raw/rgb/{}/{}/frame_{:010d}.jpg'
    gulp_subfolder = ('rgb_train' if mode == 'train' or mode == 'val'
                      else 'rgb_' + mode)
    gulp_path = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp/' + gulp_subfolder
    epic_inst = list(EpicVideoDataset(gulp_path, 'verb+noun'))
    epic_inst.sort(
        key=(lambda k: k.video_id + '_{:010d}'.format(k.start_frame)))

    result = []

    # Loop over all pairs of consecutive action segments
    for i in range(len(epic_inst) - 1):
        segment_A = epic_inst[i]
        segment_B = epic_inst[i + 1]

        # Proceed only if both segments belong to the same video
        if segment_A.video_id != segment_B.video_id:
            continue

        # Condition on first action
        if not (verb_A in [segment_A.verb, segment_A.verb_class]) or \
                not (noun_A in [segment_A.noun, segment_A.noun_class]):
            continue

        # Condition on second action
        if not (verb_B in [segment_B.verb, segment_B.verb_class]) or \
                not (noun_B in [segment_B.noun, segment_B.noun_class]):
            continue

        # Append clip: (video_id, start_frame, frames_A, frames_gap, frames_B)
        cur_item = (segment_A.video_id, segment_A.start_frame,
                    segment_A.num_frames,
                    segment_B.start_frame -
                    (segment_A.start_frame + segment_A.num_frames),
                    segment_B.num_frames)
        result.append(cur_item)

    return result
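# Usage sketch (hedged): retrieve all clips where the person opens the fridge and
# then closes it. Per the docstring above, verbs/nouns may be strings or class
# indices; the specific labels used here are illustrative assumptions.
def example_retrieve_open_close_fridge():
    clips = retrieve_action_pair_clips('open', 'fridge', 'close', 'fridge', mode='train')
    for video_id, start_frame, frames_A, frames_gap, frames_B in clips[:3]:
        total_frames = frames_A + frames_gap + frames_B
        print(video_id, start_frame, total_frames)
    return clips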
import numpy as np
import torch

from detectron2 import model_zoo
from detectron2.engine.defaults import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog
from detectron2.structures import BoxMode
from gulpio import GulpDirectory
from epic_kitchens.dataset.epic_dataset import EpicVideoDataset
from gulpio.transforms import Scale, CenterCrop, Compose, UnitNorm

from read_gulpio import EpicDataset

class_type = 'noun'
rgb_train = EpicVideoDataset('../../epic/data/processed/gulp/rgb_train', class_type)
transforms = Compose([])
dataset = EpicDataset(transforms)

segment_uids = list(rgb_train.gulp_dir.merged_meta_dict.keys())
example_segment = rgb_train.video_segments[10]
example_frames = rgb_train.load_frames(example_segment)

dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

# Grab a single frame from the first batch and convert it to HWC layout
for batch_num, (data, label) in enumerate(dataloader):
    frame = data[0].to('cpu').detach().numpy().copy()
    frame = frame.transpose(1, 2, 3, 0)
    frame = np.squeeze(frame)
    break

im = frame
cfg = get_cfg()
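# Sketch (hedged) of how the Detectron2 config started above is typically completed:
# load a Mask R-CNN config and weights from the model zoo and run the predictor on
# the single frame `im` extracted from the dataloader. The model choice and score
# threshold are assumptions, not taken from the original script; note that
# DefaultPredictor expects a BGR uint8 image, so `im` may need rescaling depending
# on the transforms applied by EpicDataset.
cfg.merge_from_file(model_zoo.get_config_file(
    'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml'))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(
    'COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml')
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # only keep confident detections
predictor = DefaultPredictor(cfg)
outputs = predictor(im)

# Visualize the predicted instances on the frame
v = Visualizer(im[:, :, ::-1], MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), scale=1.0)
vis = v.draw_instance_predictions(outputs['instances'].to('cpu'))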
class epic_gulp(data.Dataset):
    def __init__(self, mode='train', transform=None, seq_len=6, num_seq=5,
                 downsample=3, class_type='verb+noun'):
        self.mode = mode
        self.transform = transform
        self.seq_len = seq_len
        self.num_seq = num_seq
        self.downsample = downsample
        self.class_type = class_type

        gulp_root = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp'
        print(os.path.join(gulp_root, 'rgb_train', self.class_type))
        self.EpicDataset = EpicVideoDataset(
            os.path.join(gulp_root, 'rgb_train'), self.class_type)

        dataset = list(self.EpicDataset)
        rgb = []
        for i in range(len(dataset)):
            # Remove segments that are too short to sample a full sequence from
            if dataset[i].num_frames > self.seq_len * self.num_seq * self.downsample:
                rgb.append(dataset[i])
        del dataset

        # Random (unseeded) 80/20 train/val split
        train_idx = random.sample(range(1, len(rgb)), int(float(len(rgb)) * 0.8))
        rgb_train = []
        rgb_val = []
        for i in range(len(rgb)):
            if i in train_idx:
                rgb_train.append(rgb[i])
            else:
                rgb_val.append(rgb[i])

        if self.mode == 'train':
            self.video_info = rgb_train
        elif self.mode in ['val']:
            self.video_info = rgb_val

    def idx_sampler(self, index):
        '''Sample a block of frame indices of shape (num_seq, seq_len) for segment `index`.'''
        vlen = self.video_info[index].num_frames
        if vlen - self.num_seq * self.seq_len * self.downsample <= 0:
            return None
        n = 1
        # Random start, then num_seq consecutive blocks of seq_len downsampled frames
        start_idx = np.random.choice(
            range(vlen - self.num_seq * self.seq_len * self.downsample), n)
        seq_idx = np.expand_dims(np.arange(self.num_seq), -1) \
            * self.downsample * self.seq_len + start_idx
        seq_idx_block = seq_idx + \
            np.expand_dims(np.arange(self.seq_len), 0) * self.downsample
        return seq_idx_block

    def __getitem__(self, index):
        idx_block = self.idx_sampler(index)
        assert idx_block.shape == (self.num_seq, self.seq_len)
        idx_block = idx_block.reshape(self.num_seq * self.seq_len)

        # Load all frames of the segment, then pick the sampled indices
        segment = self.EpicDataset.load_frames(self.video_info[index])
        seq = [segment[i] for i in idx_block]

        t_seq = self.transform(seq)  # apply the same transform to every frame
        num_crop = None
        try:
            (C, H, W) = t_seq[0].size()
            t_seq = torch.stack(t_seq, 0)
        except Exception:
            # Five-crop style transforms return a list of lists
            (C, H, W) = t_seq[0][0].size()
            tmp = [torch.stack(i, 0) for i in t_seq]
            assert len(tmp) == 5
            num_crop = 5
            t_seq = torch.stack(tmp, 1)
        t_seq = t_seq.view(self.num_seq, self.seq_len, C, H, W).transpose(1, 2)

        action = torch.LongTensor([self.video_info[index].verb_class])
        noun = torch.LongTensor([self.video_info[index].noun_class])

        # Return all useful information in a dictionary (instead of sequence only)
        result = {
            't_seq': t_seq,
            'idx_block': idx_block,
            'vpath': 'TODO, idk how to retrieve',
            'action': action,
            'noun': noun
        }
        return result

    def __len__(self):
        return len(self.video_info)
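# Usage sketch (hedged): epic_gulp.__getitem__ applies self.transform to a *list* of
# frames and expects a list of CxHxW tensors back, so a sequence-aware (group)
# transform is required; the `transform` argument here is a placeholder for whatever
# the original project supplies.
def example_epic_gulp_loader(transform):
    dataset = epic_gulp(mode='train', transform=transform,
                        seq_len=6, num_seq=5, downsample=3)
    loader = torch.utils.data.DataLoader(dataset, batch_size=4,
                                         shuffle=True, num_workers=4)
    for batch in loader:
        # t_seq: (B, num_seq, C, seq_len, H, W) after the transpose in __getitem__
        print(batch['t_seq'].shape, batch['action'].shape, batch['noun'].shape)
        break
    return loader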
def main(conf, test_set, test_part=-1):
    gulp_path = os.path.join(conf.gulp_test_dir, conf.modality.lower(),
                             'test', test_set)
    gulp_path = os.path.realpath(gulp_path)
    gulp_path = Path(gulp_path)

    classes_map = pickle.load(open(conf.classes_map, 'rb'))
    conf.num_classes = count_num_classes(classes_map)

    net = TSN(conf.num_classes, 1, conf.modality,
              base_model=conf.arch,
              consensus_type=conf.crop_fusion_type,
              dropout=conf.dropout)

    checkpoint = torch.load(conf.weights)
    print("Model epoch {} best prec@1: {}".format(checkpoint['epoch'],
                                                  checkpoint['best_prec1']))
    base_dict = {
        '.'.join(k.split('.')[1:]): v
        for k, v in list(checkpoint['state_dict'].items())
    }
    net.load_state_dict(base_dict)

    if conf.test_crops == 1:
        cropping = torchvision.transforms.Compose([
            GroupScale(net.scale_size),
            GroupCenterCrop(net.input_size),
        ])
    elif conf.test_crops == 10:
        cropping = torchvision.transforms.Compose(
            [GroupOverSample(net.input_size, net.scale_size)])
    else:
        raise ValueError(
            "Only 1 and 10 crops are supported while we got {}".format(
                conf.test_crops))

    class_type = 'verb+noun' if conf.class_type == 'action' else conf.class_type
    if conf.modality == 'Flow':
        dataset = EpicVideoFlowDataset(gulp_path=gulp_path, class_type=class_type)
    else:
        dataset = EpicVideoDataset(gulp_path=gulp_path, class_type=class_type)

    data_loader = torch.utils.data.DataLoader(
        EpicTSNTestDataset(
            dataset,
            classes_map,
            num_segments=conf.test_segments,
            new_length=1 if conf.modality == "RGB" else 5,
            modality=conf.modality,
            transform=torchvision.transforms.Compose([
                cropping,
                Stack(roll=conf.arch == 'BNInception'),
                ToTorchFormatTensor(div=conf.arch != 'BNInception'),
                GroupNormalize(net.input_mean, net.input_std),
            ]),
            part=test_part),
        batch_size=1,
        shuffle=False,
        num_workers=conf.workers * 2,
        pin_memory=True)

    net = torch.nn.DataParallel(net, device_ids=conf.gpus).cuda()
    net.eval()

    total_num = len(data_loader.dataset)
    output = []
    proc_start_time = time.time()
    for i, (keys, input_) in enumerate(data_loader):
        rst = eval_video(conf, (i, keys, input_), net)
        output.append(rst[1:])
        cnt_time = time.time() - proc_start_time
        print('video {} done, total {}/{}, average {} sec/video'.format(
            i, i + 1, total_num, float(cnt_time) / (i + 1)))

    video_index = [x[0] for x in output]
    scores = [x[1] for x in output]

    save_scores = './{}/tsn_{}_{}_testset_{}_{}_lr_{}_model_{:03d}.npz'.format(
        conf.checkpoint, conf.class_type, conf.modality.lower(), test_set,
        conf.arch, conf.lr, checkpoint['epoch'])
    if test_part > 0:
        save_scores = save_scores.replace('.npz',
                                          '_part-{}.npz'.format(test_part))
    np.savez(save_scores, segment_indices=video_index, scores=scores)
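# Sketch (hedged) of a configuration object that would satisfy main() above: every
# field listed here is read by main(), but the values are illustrative assumptions;
# in the original code `conf` is presumably built by an argument parser or config file.
from argparse import Namespace

def example_test_conf():
    conf = Namespace(
        gulp_test_dir='data/processed/gulp_test',   # placeholder path
        modality='RGB',
        classes_map='classes_map.pkl',
        arch='BNInception',
        crop_fusion_type='avg',
        dropout=0.7,
        weights='checkpoints/tsn_rgb_best.pth.tar',  # placeholder checkpoint
        test_crops=10,
        class_type='verb',
        test_segments=25,
        workers=4,
        gpus=[0],
        checkpoint='checkpoints',
        lr=0.001)
    main(conf, test_set='seen')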
def __init__(self, mode='train', transform=None, seq_len=5, num_seq=8,
             downsample=6, class_type='both', train_val_split=0.2,
             verb_subset=None, noun_subset=None, participant_subset=None,
             drive='ssd', sample_method='within', sample_offset=0,
             label_fraction=1.0):
    '''
    mode: train / val / test_seen / test_unseen.
    seq_len: Number of frames in a video block.
    num_seq: Number of video blocks in a clip / sequence.
    downsample: Temporal sampling rate of frames. The effective new FPS becomes old FPS / downsample.
    class_type: verb / noun / both.
    label_fraction: < 1.0 means use fewer labels for training.
    verb_subset: List of verb strings to condition on, if not None.
    noun_subset: List of noun strings to condition on, if not None.
    participant_subset: List of participants to condition on, if not None.
        Examples: verb_subset = ['take', 'open', 'close'], participant_subset = ['P16', 'P11'].
    sample_method: within / match_start / match_end / before
        - within = uniformly randomly sample a sequence fully within an action label segment
          (e.g. for action classification)
        - match_start = START of sequence matches START of action label segment
        - match_end = END of sequence matches END of action label segment
          (e.g. for future uncertainty ranking)
        - before = END of sequence matches START of action label segment
          (e.g. for action anticipation)
        (NOTE: 'within' discards segments that are too short; the other methods do not.)
    sample_offset: Number of video blocks to shift sequence sampling by.
        Example 1: if (sample_method == 'match_start', sample_offset == -2), then the video
        sequence starts 2 blocks before the current action starts.
        Example 2: if (sample_method == 'match_end', sample_offset == 3, pred_step == 3), then
        all warmup video blocks represent the current action in progress, but all predicted
        blocks represent another, unknown action.
        Example 3: if (sample_method == 'before', sample_offset == 1, pred_step == 3), then only
        the LAST predicted block represents the current action; all preceding blocks represent
        something else.
    '''
    if not (class_type in ['verb', 'noun']):
        class_type = 'verb+noun'
        # print('=> class_type is now set to both a.k.a. verb+noun')

    self.mode = mode
    self.transform = transform
    self.seq_len = seq_len
    self.num_seq = num_seq
    self.downsample = downsample
    self.block_frames = seq_len * downsample  # number of frames in one video block
    # Number of frames in one complete sequence
    self.seq_frames = seq_len * num_seq * downsample
    self.class_type = class_type
    self.train_val_split = train_val_split
    self.verb_subset = verb_subset
    self.noun_subset = noun_subset
    self.participant_subset = participant_subset
    self.drive = drive
    self.sample_method = sample_method
    self.sample_offset = sample_offset
    self.label_fraction = label_fraction

    # Verify arguments
    if not (mode in ['train', 'val', 'test_seen', 'test_unseen']):
        raise ValueError('Unknown dataset mode: ' + mode)

    # Specify paths, both gulp (= main) and jpg (= backup)
    # JPG path format arguments: participant_id, video_id, frame_number
    if drive == 'ssd':
        gulp_root = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp'
        self.jpg_path = '/local/vondrick/epic-kitchens/raw/rgb/{}/{}/frame_{:010d}.jpg'
    else:
        print('== WARNING! == using HDD instead of SSD')
        gulp_root = '/proj/vondrick/datasets/epic-kitchens/data/processed/gulp'
        self.jpg_path = '/proj/vondrick/datasets/epic-kitchens/data/raw/rgb/{}/{}/frame_{:010d}.jpg'

    # Load video info (RGB frames)
    subfolder = ('rgb_train' if mode == 'train' or mode == 'val'
                 else 'rgb_' + mode)
    full_path = os.path.join(gulp_root, subfolder)
    print('Selected dataset:', full_path, self.class_type)
    self.epic_dataset_inst = EpicVideoDataset(full_path, self.class_type)

    # Split dataset randomly into train & validation with a fixed seed
    # NOTE: this split will be different for other values of train_val_split
    dataset = list(self.epic_dataset_inst)
    if mode in ['train', 'val']:
        rand_state = random.getstate()
        random.seed(8888)
        train_list = random.sample(
            dataset, int(len(dataset) * (1 - self.train_val_split)))  # without replacement
        random.setstate(rand_state)  # retain & restore random state

        if label_fraction < 1.0:
            print('== WARNING! == using just a fraction of available labels for training: '
                  + str(label_fraction * 100) + '%')
            used_train_len = int(label_fraction * len(train_list))
            # Deterministic operation because of the fixed seed above
            train_list = train_list[:used_train_len]

        if mode == 'train':
            dataset = train_list
        elif mode == 'val':
            train_set = set(train_list)
            val_list = [item for item in dataset if item not in train_set]
            dataset = val_list

    # Loop over segments in the epic dataset and filter out videos
    rgb = []
    for i in range(len(dataset)):
        # If 'within', retain only sufficiently long video clips
        if sample_method == 'within' and dataset[i].num_frames <= self.seq_frames:
            continue
        # Condition on verbs
        if verb_subset is not None and not (dataset[i].verb in verb_subset):
            continue
        # Condition on nouns
        if noun_subset is not None and not (dataset[i].noun in noun_subset):
            continue
        # Condition on participants
        if participant_subset is not None and not (
                dataset[i].participant_id in participant_subset):
            continue
        rgb.append(dataset[i])

    self.video_info = rgb
    del dataset
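# Usage sketch (hedged): the __init__ above belongs to a dataset class whose name is
# not shown in this snippet; `EpicSequenceDataset` below is an assumed placeholder
# name. This illustrates the 'before' sampling mode for action anticipation,
# restricted to a few verbs and participants, mirroring the docstring examples.
def example_anticipation_split(transform):
    train_set = EpicSequenceDataset(
        mode='train', transform=transform,
        seq_len=5, num_seq=8, downsample=6,
        sample_method='before', sample_offset=0,
        verb_subset=['take', 'open', 'close'],
        participant_subset=['P16', 'P11'])
    val_set = EpicSequenceDataset(
        mode='val', transform=transform,
        seq_len=5, num_seq=8, downsample=6,
        sample_method='before', sample_offset=0,
        verb_subset=['take', 'open', 'close'],
        participant_subset=['P16', 'P11'])
    print(len(train_set.video_info), len(val_set.video_info))
    return train_set, val_set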