def prepare(self, labels):
    FPS = 30  # each GOP spans GOP_SIZE / FPS seconds (12 / 30 = 0.4 s)
    video_path, gop_index, targets, ids = [], [], [], []
    for i, (vid, label) in enumerate(labels.items()):
        vpath = os.path.join(self._data_root, vid + '.mp4')
        num_frames = get_num_frames(vpath)
        num_gop = num_frames // GOP_SIZE

        # Alternative (disabled): one-hot multi-label target per 5-GOP window.
        # for gop in range(num_gop // 5):
        #     target = torch.IntTensor(157).zero_()
        #     c = 0
        #     for x in label:
        #         if (x['start'] < gop * 5 * 0.4) and ((gop * 5 + 1) * 0.4 < x['end']):
        #             target[cls2int(x['class'])] = 1
        #             c = c + 1
        #     if c != 0:
        #         gop_index.append(gop * 5)
        #         video_path.append(vpath)
        #         targets.append(target)
        #         ids.append(vid)

        # One target per GOP: a GOP gets an action's class if the action's
        # [start, end] interval covers it. NOTE: the * 5 factor in the time
        # check appears to be carried over from the disabled 5-GOP variant.
        for x in label:
            for gop in range(num_gop):
                if (x['start'] < gop * 5 * 0.4) and ((gop * 5 + 1) * 0.4 < x['end']):
                    video_path.append(vpath)
                    gop_index.append(gop)
                    targets.append(cls2int(x['class']))
                    ids.append(vid)

        # Alternative (disabled): per-GOP single labels at train time,
        # video-level one-hot targets for every GOP at test time.
        # if self._is_train:
        #     for x in label:
        #         for gop in range(num_gop):
        #             if (x['start'] < gop * 0.4) and ((gop + 1) * 0.4 < x['end']):
        #                 video_path.append(vpath)
        #                 gop_index.append(gop)
        #                 targets.append(cls2int(x['class']))
        #                 ids.append(vid)
        # else:
        #     target = torch.IntTensor(157).zero_()
        #     for x in label:
        #         target[cls2int(x['class'])] = 1
        #     for gop in range(num_gop):
        #         video_path.append(vpath)
        #         gop_index.append(gop)
        #         targets.append(target)
        #         ids.append(vid)

    print(gop_index)
    return {'video_path': video_path, 'gop_index': gop_index,
            'targets': targets, 'ids': ids}
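# A minimal usage sketch (an assumption, not part of the original class): fetch
# one training sample from the dict prepare() returns. `load` is coviar's frame
# loader used throughout these snippets; gop_pos 0 picks the GOP's first frame,
# rep_idx 0 selects the RGB representation.
def get_sample(meta, idx, rep_idx=0, accumulate=True):
    vpath = meta['video_path'][idx]
    gop = meta['gop_index'][idx]
    img = load(vpath, gop, 0, rep_idx, accumulate)  # frame 0 of that GOP
    return img, meta['targets'][idx], meta['ids'][idx]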
def _load_video(self, video_name):
    # Choose which representation to extract.
    representation_idx = 0
    if self._representation == 'mv':
        representation_idx = 1
    elif self._representation == 'residual':
        representation_idx = 2

    # Compute the number of segments.
    total_frames = get_num_frames(video_name)
    total_segments = total_frames // SEG_SIZE

    # Record the middle frame of each segment, cropped to the ROI.
    frames = []
    for i in range(total_segments):
        gop_idx, gop_pos = self._get_frame_index(total_frames, i)
        img = load(video_name, gop_idx, gop_pos, representation_idx,
                   self._accumulate)
        roi_img = img[int(ROI_Y):int(ROI_Y + ROI_HEIGHT),
                      int(ROI_X):int(ROI_X + ROI_WIDTH)]
        frames.append(roi_img)

    # By default, one action is recognized from every window of 3 consecutive
    # segments (sliding window of size 3, stride 1).
    for i in range(2, len(frames)):
        self._frames.append([frames[i - 2], frames[i - 1], frames[i]])
    frames.clear()
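# _get_frame_index is called above but not defined in this snippet. A plausible
# sketch (an assumption, not the original helper): map segment i to the GOP
# index and in-GOP position of that segment's middle frame, given SEG_SIZE
# frames per segment and GOP_SIZE frames per GOP.
def _get_frame_index(self, total_frames, i):
    middle_frame = i * SEG_SIZE + SEG_SIZE // 2  # middle frame of segment i
    return middle_frame // GOP_SIZE, middle_frame % GOP_SIZE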
def _load_list(self, video_list):
    self._video_list = []
    with open(video_list, 'r') as f:
        for line in f:
            video, _, label = line.strip().split()
            video_path = os.path.join(self._data_root, video[:-4] + '.mp4')
            self._video_list.append(
                (video_path, int(label), get_num_frames(video_path)))
    print('%d videos loaded.' % len(self._video_list))
def _load_list(self, video_list):
    self._video_list = []
    if self._dataset == 'ucf101':
        with open(video_list, 'r') as f:
            for line in f:
                video, _, label = line.strip().split(' ')
                video_path = os.path.join(self._data_root, video[:-4] + '.mp4')
                self._video_list.append(
                    (video_path, int(label), get_num_frames(video_path)))
    elif self._dataset == 'kinetics400':
        with open(video_list, 'r') as f:
            for line in f:
                video, label = line.strip().split(',')
                video_path = os.path.join(self._data_root, video)
                self._video_list.append(
                    (video_path, int(label), get_num_frames(video_path)))
    print('%d videos loaded.' % len(self._video_list))
def _parse_function_v2(filename, label, nSegments):
    reps_np = []
    nFrames = get_num_frames(filename.decode())
    for representation_idx in range(3):
        frames = []
        for seg_idx in range(nSegments):
            gop_index, gop_pos = getTrainFrameIndex(nFrames, seg_idx,
                                                    nSegments,
                                                    representation_idx)
            img = load(filename.decode(), gop_index, gop_pos,
                       representation_idx, True)
            if img is None:
                # print('Error: loading video %s failed.' % filename.decode())
                img = np.zeros((256, 256, 3))
            else:
                if representation_idx == 1:
                    # Motion vectors: rescale into [0, 255] and pad a zero
                    # channel so the tensor is 3-channel like the others.
                    img = (img * (127.5 / 20)).astype(np.int32)
                    img += 128
                    img = np.minimum(np.maximum(img, 0), 255).astype(np.uint8)
                    img = np.append(img, np.zeros_like(img[..., 0, None]),
                                    axis=-1)
                elif representation_idx == 2:
                    # Residuals: shift from signed to unsigned range.
                    img += 128
                    img = np.minimum(np.maximum(img, 0), 255).astype(np.uint8)
                else:
                    img = img[..., ::-1]  # flip BGR to RGB
            frames.append(img)

        np_frames = np.array(frames).astype(np.float32) / 255.0
        if representation_idx == 0:
            np_frames = (np_frames - DATA_MEAN) / DATA_STD
        elif representation_idx == 2:
            np_frames = (np_frames - 0.5) / DATA_STD
        elif representation_idx == 1:
            np_frames = np_frames - 0.5
        # Crop to 224 x 224.
        np_frames = np_frames[:, 16:240, 52:276, :].astype(np.float32)
        reps_np.append(np_frames)
    return reps_np[0], reps_np[1], reps_np[2], label
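# A hedged sketch (an assumption, not from the original file) of wiring
# _parse_function_v2 into a TF 1.x input pipeline via tf.py_func. The names
# `filenames`, `labels`, and `num_segments` are illustrative, and labels are
# assumed to be int64; tf.py_func drops static shapes, so this is only a
# minimal example.
import tensorflow as tf

def make_tf_dataset(filenames, labels, num_segments, batch_size=8):
    ds = tf.data.Dataset.from_tensor_slices((filenames, labels))
    ds = ds.map(lambda f, l: tf.py_func(
        _parse_function_v2, [f, l, num_segments],
        [tf.float32, tf.float32, tf.float32, tf.int64]))
    return ds.batch(batch_size)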
def _load_list(self, video_list):
    self._video_list = []
    with open(video_list, 'r') as f:
        for line in f:
            video, _, label = line.strip().split()
            # print('video:{}'.format(video))       # video: WritingOnBoard/v_WritingOnBoard_g25_c07.avi
            # print('-:{}'.format(_))               # _: WritingOnBoard
            # print('label:{}'.format(int(label)))  # label: 99
            video_path = os.path.join(self._data_root, video[:-4] + '.mp4')
            # TODO: open questions about get_num_frames
            self._video_list.append(
                (video_path, int(label), get_num_frames(video_path)))
    # print(self._video_list)
    print('{} videos loaded.'.format(len(self._video_list)))
def main():
    for video_name in video_names:
        fold_path = video_name.split('.avi')[0].split('/')[-1]
        path_mv = os.path.join(fold_path, PATH_MV_CONT)
        path_res = os.path.join(fold_path, PATH_RES_CONT)
        if not os.path.exists(path_mv):
            os.makedirs(path_mv)
        if not os.path.exists(path_res):
            os.makedirs(path_res)

        num_frames = get_num_frames(video_name)
        print(num_frames)

        for curGopIdx in range(max(num_frames // GOP_FRAMES_NUM, 1)):
            for innerGopIdx in range(GOP_FRAMES_NUM):
                curFrameIdx = curGopIdx * GOP_FRAMES_NUM + innerGopIdx
                print(video_name, curGopIdx, innerGopIdx)
                mvCont_origin = load(video_name, curGopIdx, innerGopIdx, 1, False)
                resCont = load(video_name, curGopIdx, innerGopIdx, 2, False)

                if mvCont_origin is None:
                    mvCont_origin = np.zeros([720, 960, 2], dtype=np.int16)
                # Shift motion vectors to a non-negative range, then pack each
                # 16-bit component into two 8-bit PNG channels:
                # (high_h, low_h, high_w, low_w).
                mvCont = mvCont_origin + 2048
                mvPng = np.array([(mvCont[:, :, 0] >> 8) & 0xff,
                                  mvCont[:, :, 0] & 0xff,
                                  (mvCont[:, :, 1] >> 8) & 0xff,
                                  mvCont[:, :, 1] & 0xff], dtype=np.uint8)
                mvPng = np.transpose(mvPng, [1, 2, 0])
                imsave(path_mv + '/frame' + str(curFrameIdx) + '.png', mvPng)

                if resCont is None:
                    resCont = np.zeros([720, 960, 3], dtype=np.uint8)
                # Map residuals from the signed range into [0, 255] for storage.
                resCont = np.round((resCont + 256) / 2).astype(np.uint8)
                imsave(path_res + '/frame' + str(curFrameIdx) + '.png', resCont)
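# Reconstructed from the verification code that was commented out in main():
# a helper that undoes the byte packing and recovers the signed motion-vector
# field from a saved PNG. It assumes imread returns the 4-channel uint8 array
# exactly as written; int32 is used so the left shift cannot overflow.
def load_mv_png(png_path):
    mv_png = imread(png_path).astype(np.int32)
    mv = np.array([(mv_png[:, :, 0] << 8) + mv_png[:, :, 1],
                   (mv_png[:, :, 2] << 8) + mv_png[:, :, 3]])
    mv = np.transpose(mv, [1, 2, 0])
    return mv - 2048  # undo the non-negative shift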
def _load_list(self, video_list):
    # video_list: e.g. ucf101_split1_train.txt, with lines like
    #   ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01.avi ApplyEyeMakeup 0
    # video = *.avi, label = 0
    self._video_list = []
    with open(video_list, 'r') as f:
        for line in f:
            video, _, label = line.strip().split()
            video_path = os.path.join(self._data_root, video[:-4] + '.mp4')
            # get_num_frames is the C extension entry:
            #   {"get_num_frames", ..., METH_VARARGS, "Getting number of frames in a video."}
            self._video_list.append(
                (video_path, int(label), get_num_frames(video_path)))
    # _video_list entries: (path to *.mp4, label, number of frames)
    print('%d videos loaded.' % len(self._video_list))
def load_list(video_list):
    # video_list: e.g. ucf101_split1_train.txt, with lines like
    #   ApplyEyeMakeup/v_ApplyEyeMakeup_g01_c01.avi ApplyEyeMakeup 0
    _video_list = []
    with open(video_list, 'r') as f:
        for line in f:
            video, folder, label = line.strip().split()
            video_path = os.path.join(data_root, video[:-4] + '.mp4')
            video_name = video[video.rfind('/') + 1:-4]
            num_frames = get_num_frames(video_path)
            _video_list.append(
                (video_path, folder, video_name, int(label), num_frames))
    # _video_list entries: (path, folder name (ApplyEyeMakeup), video name,
    # label, number of frames)
    with open('video_list.p', 'wb') as list_p:
        pickle.dump(_video_list, list_p)
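# A small companion sketch (an assumption, not in the original file): load the
# pickled list back so repeated runs can skip the per-video get_num_frames()
# calls, which require decoding each container.
def read_cached_list(path='video_list.p'):
    with open(path, 'rb') as list_p:
        return pickle.load(list_p)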
def _perturbation_image(model, original_image, ori_label, video_path,
                        save_path, transform_post, args, config, device):
    original_image = original_image.to(device)
    total_frames = get_num_frames(video_path)
    original_image_ = original_image.clone()  # torch.Size([1, 3, 72, 84, 84])
    num_frame, channel, height, width = original_image.shape
    dim = height * width * channel

    loop = 0
    inner_loop = 0
    success = False
    num_query = 0
    num_pframe = 0
    max_query = 60000

    # Bandit-attack hyper-parameters.
    exploration = 0.1
    fd_eta = 0.1
    online_lr = 0.1
    flow_lr = 0.025

    target_label = (ori_label + 1) % args.num_classes
    # Alternative (disabled): pick a random target label instead.
    # while target_label == ori_label:
    #     target_label = torch.tensor([random.sample(range(174), 1)[0]]).cuda()

    motion_vector = list()
    prior = torch.zeros(num_frame, channel, height, width).to(device)
    delta = torch.zeros(num_frame, channel, height, width).to(device)
    est_grad = torch.zeros(num_frame, channel, height, width).to(device)
    adv_img = torch.zeros(3, num_frame, channel, height, width).to(device)
    iframe = torch.zeros(num_frame, height, width, channel).to(device)
    noise_frames = torch.zeros(num_frame, channel, height, width).to(device)
    index_visual = torch.zeros(num_frame, 2, height, width).to(device)
    index_motion = torch.zeros(num_frame, height, width, 2).to(device)

    while not (num_query > max_query):
        pred_adv_logit = list()
        start1 = time.time()
        end_index = total_frames // GOP_SIZE

        if loop % args.interval == 0:  # can also try 8 for tsn2d
            # Refresh the motion-vector field every `interval` loops.
            # mv_index = int(torch.rand(1) * end_index)
            mv_index = inner_loop % end_index
            mv = load(video_path, mv_index, 11, 1, True)
            mv = mv - mv.min()
            mv = np.dstack((mv, np.zeros(mv.shape[:2] + (1,))))
            mv = [mv.astype(np.uint8)] * num_frame
            inner_loop += 1
            motion_vector = transform_post(mv)
            motion_vector = np.stack(motion_vector, axis=0) * 255
            motion_vector = torch.from_numpy(motion_vector).permute(
                0, 2, 3, 1).float().to(device)
            # Normalize pixel coordinates into [-1, 1] for F.grid_sample.
            motion_vector[:, :, :, 0] = (2 * motion_vector[:, :, :, 0]
                                         - height + 1.) / (height - 1.)
            motion_vector[:, :, :, 1] = (2 * motion_vector[:, :, :, 1]
                                         - width + 1.) / (width - 1.)

        # Warp a single noise image along the motion vectors so the
        # perturbation stays temporally coherent across frames.
        noise_frames = torch.randn(1, 3, height, width).repeat(
            num_frame, 1, 1, 1).to(device)
        noise_frames = F.grid_sample(noise_frames, motion_vector[:, :, :, :2])
        exp_noise = exploration * noise_frames

        # Two-point finite-difference gradient estimate (bandit attack).
        q1 = prior + exp_noise
        q2 = prior - exp_noise
        adv_img[0] = original_image + fd_eta * q1 / norm2(q1)
        adv_img[1] = original_image + fd_eta * q2 / norm2(q2)
        adv_img[2] = original_image

        for i in range(3):
            img_group = normalization(adv_img[i].clone().cpu().numpy(), args)
            tmp_result = model(img_group.astype('float32', copy=False))
            tmp_result = FF.mean(tmp_result, axis=0, keepdims=True)
            tmp_result = torch.from_numpy(tmp_result.asnumpy()).to(device)
            pred_adv_logit.append(tmp_result)

        l1, _, _, _, _, _, _ = _pert_loss(pred_adv_logit[0], ori_label,
                                          target_label, delta)
        l2, _, _, _, _, _, _ = _pert_loss(pred_adv_logit[1], ori_label,
                                          target_label, delta)
        loss, target, real, other, other_class, second_logit, second_class = \
            _pert_loss(pred_adv_logit[2], ori_label, target_label, delta)
        num_query += 3

        est_deriv = (l1 - l2) / (fd_eta * exploration * exploration)
        est_grad = est_deriv.item() * exp_noise
        prior += online_lr * est_grad

        # Gradient step, then project back into the L-inf ball and [0, 1].
        original_image = original_image - flow_lr * prior.sign()
        delta = original_image_ - original_image
        tmp_norm = norm2(delta)
        original_image = torch.max(
            torch.min(original_image, original_image_ + 0.03),
            original_image_ - 0.03)
        original_image = torch.clamp(original_image, 0, 1)

        pred_adv_label = pred_adv_logit[2].argmax()
        if (loop % 1000 == 0) or (loop == max_query) or pred_adv_label != ori_label:
            # Targeted variant: ... or pred_adv_label == target_label:
            print('[T2]{:.3f}s for [{}]-th loop\t'
                  'Queries {:03d}\t'
                  'Overall loss {:.3f}\t'
                  'est_deriv {:.3f}\t'
                  'Target {}\t'
                  'Target logit {:.3f}\t'
                  'ori logit {:.3f}\t'
                  'ori class {}\t'
                  'second logit {:.3f}\t'
                  'second class {}\t'.format(
                      time.time() - start1, loop, num_query, loss,
                      est_deriv.item(), target, real, other, other_class,
                      second_logit, second_class))
        loop += 1

        if pred_adv_label != ori_label:
            # Targeted variant: if pred_adv_label == target_label:
            diff = adv_img[2] - original_image_
            print('diff max {:.3f}, diff min {:.3f}'.format(
                diff.max(), diff.min()))
            success = True
            # save_images(num_frame, original_image_.cpu().permute(0, 2, 3, 1).numpy(),
            #             adv_img[2].cpu().permute(0, 2, 3, 1).numpy(), save_path)
            break
        if num_query >= max_query:
            # save_images(num_frame, original_image_.cpu().permute(0, 2, 3, 1).numpy(),
            #             adv_img[2].cpu().permute(0, 2, 3, 1).numpy(), save_path)
            break

    return pred_adv_label, num_query, success
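# norm2 is used above but not defined in this snippet. A plausible definition
# (an assumption, not the authors' code) matching its use as a scalar L2 norm
# for normalizing query directions and measuring the perturbation:
def norm2(x):
    # Global L2 norm over all elements, clamped to avoid division by zero.
    return x.reshape(-1).norm(p=2).clamp(min=1e-12)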
def make_dataset(opt, root_path, annotation_path, subset,
                 n_samples_for_each_video, frames_sequence):
    data = load_annotation_data(annotation_path)
    # data = torch.load(annotation_path)
    # One-off fixup (disabled): replace spaces in annotation keys with
    # underscores and re-save the annotation file.
    # new_dict = {}
    # for key, value in data.items():
    #     if ' ' in key:
    #         print(key)
    #         key = key.replace(' ', '_')
    #         new_dict[key] = value
    # torch.save(new_dict, "kinetics_annotation_full.json")

    video_names, annotations = get_video_names_and_annotations(data, subset)
    class_to_idx = get_class_labels(data)
    idx_to_class = {label: name for name, label in class_to_idx.items()}

    dataset = []
    total_number_of_clips = 0
    gflop_per_clip = 8.5
    for i in range(len(video_names)):
        if i % 1000 == 0:
            print('dataset loading [{}/{}]'.format(i, len(video_names)))
        if ' ' in video_names[i]:
            video_names[i] = video_names[i].replace(' ', '_', 1)

        if subset in ('training', 'validation'):
            # root_path = "/kinetics2/kinetics2/kinetics_val_reencode"  # old validation override
            video_path = os.path.join(opt.video_path, video_names[i]) + '.mp4'
            if not os.path.exists(video_path):
                continue

        n_frames = get_num_frames(video_path)
        if n_frames <= 0:
            continue

        sample = {
            'video': video_path,
            'n_frames': n_frames,
            'video_id': video_names[i]
        }
        if len(annotations) != 0:
            sample['label'] = class_to_idx[annotations[i]['label']]
        else:
            sample['label'] = -1

        if n_samples_for_each_video == 1:
            sample['frame_indices'] = list(range(1, n_frames + 1))
            dataset.append(sample)
        else:
            if n_samples_for_each_video > 1:
                step = max(1, math.ceil((n_frames - 1 - frames_sequence) /
                                        (n_samples_for_each_video - 1)))
            else:
                # NOTE: the original train/test branches computed the same
                # value; sample_duration is not defined in this scope and was
                # probably meant to be frames_sequence.
                step = int(sample_duration / 2)
            for j in range(1, n_frames, step):
                sample_j = copy.deepcopy(sample)
                sample_j['frame_indices'] = list(
                    range(j, min(n_frames + 1, j + frames_sequence)))
                total_number_of_clips += 1
                dataset.append(sample_j)

    if n_samples_for_each_video == 0:
        num_of_videos = len(video_names)
        avg_clips_per_video = round(total_number_of_clips / num_of_videos, 2)
        print("Number of videos:", num_of_videos)
        print("Number of clips:", total_number_of_clips)
        print("Average number of clips per video:", avg_clips_per_video)
        print("Average GFLOPs per video (true for MFNet only):",
              gflop_per_clip * avg_clips_per_video)

    return dataset, idx_to_class
def show_boxes_in_compressed_video(video_path, update_ms=10,
                                   min_confidence=0.0, box_file_path=None,
                                   min_frame_idx=None, max_frame_idx=None,
                                   frame_interval=1, frame_type=0,
                                   accumulate=False):
    """Show boxes on top of frames decoded directly from the compressed video.

    :param video_path: string, the path of the video.
    :param update_ms: scalar, 1000 / update_ms is the fps, default 10.
    :param min_confidence: float, detection confidence threshold; boxes with
        smaller confidence are not displayed. Default 0.0.
    :param box_file_path: string, path of the boxes file. The format should
        match MOTChallenge det.txt or gt.txt.
    :param min_frame_idx: integer, first frame to display; defaults to the
        first frame of the sequence.
    :param max_frame_idx: integer, last frame to display; defaults to the
        last frame of the sequence.
    :param frame_interval: the interval between displayed frames.
    :param frame_type: int, 0, 1 or 2 (I-frame, motion vector, residual).
    :param accumulate: bool; if True, the motion vectors and residuals are
        accumulated.
    :return: None
    """
    def frame_callback(vis, frame_idx):
        # Number of frames in one GOP; 12 for the raw mpeg4 videos.
        GROUP_SIZE = 12
        # GOP index starts from 0, while frame_idx here starts from 1.
        gop_idx = int((frame_idx - 1) / GROUP_SIZE)
        in_group_idx = int((frame_idx - 1) % GROUP_SIZE)
        image = coviar.load(video_path, gop_idx, in_group_idx, frame_type,
                            accumulate)
        image = compressed_frame_to_show(image, frame_type, tool_type='cv2')
        vis.set_image(image.copy(), frame_idx)

        raw_box = seq_info['boxes']
        if raw_box is not None:
            index = raw_box[:, 0] == frame_idx
            box = raw_box[index]
            index = box[:, 6] >= min_confidence
            box = box[index]
            box = box[:, 1:7]  # [target_id, x, y, w, h]
            box_list = [box[idx, :] for idx in range(box.shape[0])]
            vis.draw_box(box_list)

    total_frames = coviar.get_num_frames(video_path) + 1

    # Clamp the first and last frame indices.
    if min_frame_idx is None or min_frame_idx < 0 or min_frame_idx > total_frames:
        min_frame_idx = 1
    if max_frame_idx is None or max_frame_idx < 0 or max_frame_idx > total_frames:
        max_frame_idx = total_frames
    if min_frame_idx > max_frame_idx:
        raise RuntimeError('The first frame index ', min_frame_idx,
                           ' is larger than the last frame index ',
                           max_frame_idx)

    # Get the sequence information from the first I-frame.
    im = coviar.load(video_path, 0, 0, 0, False)
    im_size = im.shape
    raw_boxes = None if box_file_path is None else np.loadtxt(
        box_file_path, dtype=float, delimiter=',')
    seq_info = {
        'image_size': [im_size[0], im_size[1]],
        'min_frame_idx': min_frame_idx,
        'max_frame_idx': max_frame_idx,
        'frame_interval': frame_interval,
        'boxes': raw_boxes,
        'sequence_name': ''
    }
    visualizer = Visualization(seq_info, update_ms)
    visualizer.run(frame_callback)
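# Hedged usage example (the paths are hypothetical, not from the original
# code): overlay MOTChallenge ground-truth boxes on the accumulated
# motion-vector frames, updating roughly every 40 ms.
if __name__ == '__main__':
    show_boxes_in_compressed_video('data/MOT16-02.mp4', update_ms=40,
                                   box_file_path='data/MOT16-02/gt/gt.txt',
                                   frame_type=1, accumulate=True)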