def __init__(self, opt, train=True):
    super(HurricaneVideoDataset, self).__init__(opt, train=train)
    self.nc = 3 if self.opt.dataset == "hurricane" else 6

    if self.train:
        self.image_path = os.path.join('./dataset/Hurricane/', 'train')
    else:
        self.image_path = os.path.join('./dataset/Hurricane/', 'test')

    threshold = self.window_size if opt.irregular else self.sample_size
    self.image_list = remove_files_under_sample_size(
        image_path=self.image_path, threshold=threshold)
    self.image_list = sorted(self.image_list)

    vtrans = [vtransforms.Pad(padding=(1, 0), fill=0)]
    if self.train:
        # vtrans += [vtransforms.RandomHorizontalFlip()]
        # vtrans += [vtransforms.RandomRotation()]
        pass
    vtrans += [vtransforms.ToTensor(scale=False)]
    vtrans += [vtransforms.Normalize(0.5, 0.5)] if opt.input_norm else []
    self.vtrans = T.Compose(vtrans)
def VideoSpatialPrediction(vid_name,
                           target,
                           net,
                           num_categories,
                           num_samples=25,
                           new_size=299,
                           batch_size=2):
    gc = GradCAM(model=net)
    clip_mean = [0.5] * num_samples
    clip_std = [0.226] * num_samples
    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    val_transform = video_transforms.Compose([
        video_transforms.ToTensor(),
        normalize,
    ])

    deep = 1
    # inception = 299,299, resnet = 224,224
    dims = (new_size, new_size, deep, num_samples)
    rgb = np.zeros(shape=dims, dtype=np.float64)
    rgb_flip = np.zeros(shape=dims, dtype=np.float64)

    for i in range(num_samples):
        img_file = os.path.join(vid_name, 'vr_{0:02d}.png'.format(i))
        img = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE)
        rgb[:, :, 0, i] = img
        rgb_flip[:, :, 0, i] = img[:, ::-1]

    _, _, _, c = rgb.shape
    rgb_list = []
    for c_index in range(c):
        cur_img = rgb[:, :, :, c_index]
        cur_img_tensor = val_transform(cur_img)
        rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0))

    rgb_np = np.concatenate(rgb_list, axis=0)
    prediction = np.zeros((num_categories, rgb.shape[3]))

    # Pick a single sample to visualize; clamp the index so it stays inside the
    # available samples (the original hard-coded index of 50 overruns when only
    # num_samples=25 rhythm images are loaded).
    index = min(50, rgb_np.shape[0] - 1)
    input_data = rgb_np[index:index + 1, :, :, :]
    imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda()
    imgDataVar = torch.autograd.Variable(imgDataTensor)

    probs, ids = gc.forward(imgDataVar)
    ids_ = torch.LongTensor([[target]] * len(imgDataVar)).to(
        torch.device("cuda"))
    gc.backward(ids=ids_)
    regions = gc.generate(target_layer="Mixed_7c")
    save_gradcam(vid_name.split("/")[-1] + ".png",
                 gcam=regions[0, 0],
                 raw_image=rgb[:, :, :, index])

    return prediction
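A minimal sketch of how this Grad-CAM variant might be driven for a single visual-rhythm directory. The architecture name, checkpoint path, class index and frame directory below are placeholders, not values taken from the repository; the only assumption carried over from the function is that the network exposes a `Mixed_7c` layer (an Inception-v3 style backbone).

import torch
import models  # the project's model zoo, as used in the evaluation scripts

if __name__ == '__main__':
    num_categories = 101                                  # placeholder class count
    net = models.__dict__['rgb_inception_v3'](pretrained=False,
                                              num_classes=num_categories)
    params = torch.load('checkpoint.pth.tar')             # placeholder checkpoint
    net.load_state_dict(params['state_dict'])
    net.cuda().eval()

    # One directory of pre-computed visual-rhythm images (vr_00.png ... vr_24.png)
    VideoSpatialPrediction(vid_name='frames/v_ApplyEyeMakeup_g01_c01',
                           target=0,   # class index whose activation map is visualized
                           net=net,
                           num_categories=num_categories)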
def VideoSpatialPrediction(vid_name,
                           net,
                           num_categories,
                           num_samples=25,
                           new_size=299,
                           batch_size=2):
    clip_mean = [0.5] * num_samples
    clip_std = [0.226] * num_samples
    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    val_transform = video_transforms.Compose([
        video_transforms.ToTensor(),
        normalize,
    ])

    deep = 1
    # inception = 299,299, resnet = 224,224
    dims = (new_size, new_size, deep, num_samples)
    rgb = np.zeros(shape=dims, dtype=np.float64)
    rgb_flip = np.zeros(shape=dims, dtype=np.float64)

    for i in range(num_samples):
        img_file = os.path.join(vid_name, 'vr_{0:02d}.png'.format(i))
        img = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE)
        rgb[:, :, 0, i] = img
        rgb_flip[:, :, 0, i] = img[:, ::-1]

    _, _, _, c = rgb.shape
    rgb_list = []
    for c_index in range(c):
        cur_img = rgb[:, :, :, c_index]
        cur_img_tensor = val_transform(cur_img)
        rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0))

    rgb_np = np.concatenate(rgb_list, axis=0)
    prediction = np.zeros((num_categories, rgb.shape[3]))
    num_batches = int(math.ceil(float(rgb.shape[3]) / batch_size))

    for bb in range(num_batches):
        span = range(batch_size * bb, min(rgb.shape[3], batch_size * (bb + 1)))
        input_data = rgb_np[span, :, :, :]
        imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda()
        imgDataVar = torch.autograd.Variable(imgDataTensor)
        output = net(imgDataVar)
        result = output.data.cpu().numpy()
        prediction[:, span] = np.transpose(result)

    return prediction
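The function returns a (num_categories, num_samples) score matrix, one column per rhythm image. A small, hedged example of collapsing that matrix into a single class decision; the mean-then-argmax rule mirrors the batching loop above but is not taken verbatim from the repository.

import numpy as np

def aggregate_scores(prediction):
    """Average the per-sample scores returned by VideoSpatialPrediction
    and pick the best class."""
    mean_scores = prediction.mean(axis=1)        # average over the 25 rhythm samples
    return int(np.argmax(mean_scores)), mean_scores

# pred_class, scores = aggregate_scores(VideoSpatialPrediction(vid_dir, net, 101))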
def get_video_transform(data_name, split_name, opt):
    normalizer = video_transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                            std=[0.229, 0.224, 0.225])
    t_list = []
    if split_name == 'train':
        t_list = [
            video_transforms.RandomResizedCrop(opt.crop_size),
            video_transforms.RandomHorizontalFlip(),
            video_transforms.ColorJitter(brightness=0.1, contrast=0.1, hue=0.1)
        ]
    else:
        t_list = [
            video_transforms.Resize(256),
            video_transforms.CenterCrop(opt.crop_size)
        ]

    t_end = [video_transforms.ToTensor(), normalizer]
    transform = video_transforms.Compose(t_list + t_end)
    return transform
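For reference, a sketch of how this factory would typically be called; the `opt` namespace below is only illustrative and carries just the one field the function reads.

from argparse import Namespace

opt = Namespace(crop_size=224)                       # illustrative options object
train_transform = get_video_transform('ucf101', 'train', opt)
val_transform = get_video_transform('ucf101', 'val', opt)
# Both pipelines end with ToTensor + ImageNet normalization, so their output can
# be fed straight to an ImageNet-pretrained backbone.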
def __init__(self, opt, train=True):
    super(VideoDataset, self).__init__(opt, train=train)

    # Dataroot & Transform
    if opt.dataset == 'mgif':
        data_root = './dataset/moving-gif'
        vtrans = [vtransforms.Scale(size=128)]
    elif opt.dataset == 'kth':
        data_root = './dataset/kth_action/'
        vtrans = [
            vtransforms.CenterCrop(size=120),
            vtransforms.Scale(size=128)
        ]
    elif opt.dataset == 'penn':
        data_root = './dataset/penn_action/'
        vtrans = [vtransforms.Scale(size=128)]

    if self.train:
        vtrans += [vtransforms.RandomHorizontalFlip()]
        vtrans += [vtransforms.RandomRotation()]

    vtrans += [vtransforms.ToTensor(scale=True)]
    vtrans += [vtransforms.Normalize(0.5, 0.5)] if opt.input_norm else []
    self.vtrans = T.Compose(vtrans)

    if self.train:
        self.image_path = os.path.join(data_root, 'train')
    else:
        self.image_path = os.path.join(data_root, 'test')

    threshold = self.window_size if opt.irregular else self.sample_size
    if opt.dataset in ['kth', 'sintel', 'ucf101', 'penn']:
        self.image_list = os.listdir(self.image_path)
    elif opt.dataset in ['mgif', 'stickman']:
        self.image_list = remove_files_under_sample_size(
            image_path=self.image_path, threshold=threshold)
    self.image_list = sorted(self.image_list)
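A hedged construction example for the dataset above; the option fields shown are only those inferred from how `opt` is used in the two constructors, so the project's real argument parser may require more.

from argparse import Namespace
from torch.utils.data import DataLoader

# Inferred fields only; window_size / sample_size are assumed to be supplied by
# the base-class constructor.
opt = Namespace(dataset='kth', irregular=False, input_norm=True)
train_set = VideoDataset(opt, train=True)
train_loader = DataLoader(train_set, batch_size=8, shuffle=True, num_workers=4)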
def VideoSpatialPrediction(mode, vid_name, net, num_categories, start_frame=0, num_frames=0, num_samples=25, index=1, new_size=299): if num_frames == 0: imglist = os.listdir(vid_name) #imglist = list(filter(lambda x: x[:3]=='img',imglist)) duration = len(imglist) # print(duration) else: duration = num_frames # selection if mode == 'rgb': step = int(math.floor((duration - 1) / (num_samples - 1))) clip_mean = [0.485, 0.456, 0.406] clip_std = [0.229, 0.224, 0.225] else: clip_mean = [0.5, 0.5] clip_std = [0.226, 0.226] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) deep = 1 if mode == 'rhythm' else 3 # inception = 320,360, resnet = 240, 320 width = 320 if new_size == 299 else 240 height = 360 if new_size == 299 else 320 dims = (width, height, deep, num_samples) rgb = np.zeros(shape=dims, dtype=np.float64) rgb_flip = np.zeros(shape=dims, dtype=np.float64) for i in range(num_samples): if mode == 'rhythm': img_file = os.path.join(vid_name, 'visual_rhythm_{0:05d}.png'.format(index)) print(img_file) img = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE) img = cv2.resize(img, dims[1::-1]) rgb[:, :, 0, i] = img rgb_flip[:, :, 0, i] = img[:, ::-1] else: img_file = os.path.join(vid_name, 'img_{0:05d}.jpg'.format(i * step + 1)) img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED) img = cv2.resize(img, dims[1::-1]) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) rgb[:, :, :, i] = img rgb_flip[:, :, :, i] = img[:, ::-1, :] # crop 299 = inception, 224 = resnet size = new_size corner = [(height - size) // 2, (width - size) // 2] rgb_1 = rgb[:size, :size, :, :] rgb_2 = rgb[:size, -size:, :, :] rgb_3 = rgb[corner[1]:corner[1] + size, corner[0]:corner[0] + size, :, :] rgb_4 = rgb[-size:, :size, :, :] rgb_5 = rgb[-size:, -size:, :, :] rgb_f_1 = rgb_flip[:size, :size, :, :] rgb_f_2 = rgb_flip[:size, -size:, :, :] rgb_f_3 = rgb_flip[corner[1]:corner[1] + size, corner[0]:corner[0] + size, :, :] rgb_f_4 = rgb_flip[-size:, :size, :, :] rgb_f_5 = rgb_flip[-size:, -size:, :, :] rgb = np.concatenate((rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_f_1, rgb_f_2, rgb_f_3, rgb_f_4, rgb_f_5), axis=3) _, _, _, c = rgb.shape rgb_list = [] for c_index in range(c): cur_img = rgb[:, :, :, c_index] cur_img_tensor = val_transform(cur_img) rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0)) rgb_np = np.concatenate(rgb_list, axis=0) #batch_size = 25 batch_size = 5 prediction = np.zeros((num_categories, rgb.shape[3])) num_batches = int(math.ceil(float(rgb.shape[3]) / batch_size)) for bb in range(num_batches): span = range(batch_size * bb, min(rgb.shape[3], batch_size * (bb + 1))) input_data = rgb_np[span, :, :, :] imgDataTensor = torch.from_numpy(input_data).type( torch.FloatTensor).cuda() imgDataVar = torch.autograd.Variable(imgDataTensor) output = net(imgDataVar) result = output.data.cpu().numpy() prediction[:, span] = np.transpose(result) return prediction
def VideoSpatialPrediction3D(vid_name, net, num_categories, architecture_name, start_frame=0, num_frames=0, length=16, extension='img_{0:05d}.jpg', ten_crop=False): if num_frames == 0: imglist = os.listdir(vid_name) newImageList = [] if 'rgb' in architecture_name or 'pose' in architecture_name: for item in imglist: if 'img' in item: newImageList.append(item) elif 'flow' in architecture_name: for item in imglist: if 'flow_x' in item: newImageList.append(item) duration = len(newImageList) else: duration = num_frames if 'rgb' in architecture_name or 'pose' in architecture_name: if 'I3D' in architecture_name: if not 'resnet' in architecture_name: clip_mean = [0.5, 0.5, 0.5] clip_std = [0.5, 0.5, 0.5] else: clip_mean = [0.45, 0.45, 0.45] clip_std = [0.225, 0.225, 0.225] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) if '112' in architecture_name: scale = 0.5 else: scale = 1 elif 'MFNET3D' in architecture_name: clip_mean = [0.48627451, 0.45882353, 0.40784314] clip_std = [0.234, 0.234, 0.234] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose( [video_transforms.ToTensor(), normalize]) if '112' in architecture_name: scale = 0.5 else: scale = 1 elif 'tsm' in architecture_name: clip_mean = [0.485, 0.456, 0.406] clip_std = [0.229, 0.224, 0.225] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose( [video_transforms.ToTensor(), normalize]) scale = 1 elif "r2plus1d" in architecture_name: clip_mean = [0.43216, 0.394666, 0.37645] clip_std = [0.22803, 0.22145, 0.216989] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose( [video_transforms.ToTensor(), normalize]) scale = 0.5 elif 'rep_flow' in architecture_name: clip_mean = [0.5, 0.5, 0.5] clip_std = [0.5, 0.5, 0.5] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) scale = 1 elif "slowfast" in architecture_name: clip_mean = [0.45, 0.45, 0.45] clip_std = [0.225, 0.225, 0.225] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) scale = 1 else: scale = 0.5 clip_mean = [114.7748, 107.7354, 99.4750] clip_std = [1, 1, 1] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor2(), normalize, ]) elif 'flow' in architecture_name: if 'I3D' in architecture_name: clip_mean = [0.5] * 2 clip_std = [0.5] * 2 normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) scale = 1 elif "3D" in architecture_name: scale = 0.5 clip_mean = [127.5, 127.5] clip_std = [1, 1] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor2(), normalize, ]) elif "r2plus1d" in architecture_name: clip_mean = [0.5] * 2 clip_std = [0.226] * 2 normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) scale = 0.5 if '224' in architecture_name: scale = 1 if '112' in architecture_name: scale = 0.5 # selection #step = int(math.floor((duration-1)/(num_samples-1))) dims2 
= (224, 224, 3, duration) imageSize = int(224 * scale) dims = (int(256 * scale), int(340 * scale), 3, duration) #dims = (int(256 * scale),int(256 * scale),3,duration) duration = duration - 1 offsets = [] offsetMainIndexes = list(range(1, duration - length, length)) if len(offsetMainIndexes) == 0: offsets = list(range(1, duration + 2)) * int( np.floor(length / (duration + 1))) + list( range(1, length % (duration + 1) + 1)) else: shift = int((duration - (offsetMainIndexes[-1] + length)) / 2) for mainOffsetValue in offsetMainIndexes: for lengthID in range(1, length + 1): offsets.append(lengthID + mainOffsetValue + shift) # offsetMainIndexes = list(range(0,duration,length)) # for mainOffsetValue in offsetMainIndexes: # for lengthID in range(1, length+1): # loaded_frame_index = lengthID + mainOffsetValue # moded_loaded_frame_index = loaded_frame_index % (duration + 1) # if moded_loaded_frame_index == 0: # moded_loaded_frame_index = (duration + 1) # offsets.append(moded_loaded_frame_index) imageList = [] imageList1 = [] imageList2 = [] imageList3 = [] imageList4 = [] imageList5 = [] imageList6 = [] imageList7 = [] imageList8 = [] imageList9 = [] imageList10 = [] imageList11 = [] imageList12 = [] interpolation = cv2.INTER_LINEAR for index in offsets: if 'rgb' in architecture_name or 'pose' in architecture_name: img_file = os.path.join(vid_name, extension.format(index)) img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED) img = cv2.resize(img, dims[1::-1], interpolation) #img2 = cv2.resize(img, dims2[1::-1],interpolation) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) img_flip = img[:, ::-1, :].copy() elif 'flow' in architecture_name: flow_x_file = os.path.join(vid_name, extension.format('x', index)) flow_y_file = os.path.join(vid_name, extension.format('y', index)) img_x = cv2.imread(flow_x_file, cv2.IMREAD_GRAYSCALE) img_y = cv2.imread(flow_y_file, cv2.IMREAD_GRAYSCALE) img_x = np.expand_dims(img_x, -1) img_y = np.expand_dims(img_y, -1) img = np.concatenate((img_x, img_y), 2) img = cv2.resize(img, dims[1::-1], interpolation) img_flip = img[:, ::-1, :].copy() #img_flip2 = img2[:,::-1,:].copy() #imageList1.append(img[int(16 * scale):int(16 * scale + imageSize), int(16 * scale) : int(16 * scale + imageSize), :]) imageList1.append(img[int(16 * scale):int(16 * scale + imageSize), int(58 * scale):int(58 * scale + imageSize), :]) imageList2.append(img[:imageSize, :imageSize, :]) imageList3.append(img[:imageSize, -imageSize:, :]) imageList4.append(img[-imageSize:, :imageSize, :]) imageList5.append(img[-imageSize:, -imageSize:, :]) imageList6.append(img_flip[int(16 * scale):int(16 * scale + imageSize), int(58 * scale):int(58 * scale + imageSize), :]) imageList7.append(img_flip[:imageSize, :imageSize, :]) imageList8.append(img_flip[:imageSize, -imageSize:, :]) imageList9.append(img_flip[-imageSize:, :imageSize, :]) imageList10.append(img_flip[-imageSize:, -imageSize:, :]) # imageList11.append(img2) # imageList12.append(img_flip2) if ten_crop: imageList = imageList1 + imageList2 + imageList3 + imageList4 + imageList5 + imageList6 + imageList7 + imageList8 + imageList9 + imageList10 else: imageList = imageList1 #imageList=imageList11+imageList12 rgb_list = [] for i in range(len(imageList)): cur_img = imageList[i] cur_img_tensor = val_transform(cur_img) rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0)) input_data = np.concatenate(rgb_list, axis=0) if 'rgb' in architecture_name or 'pose' in architecture_name: input_data = input_data.reshape(-1, length, 3, imageSize, imageSize) elif 'flow' in 
architecture_name: input_data = input_data.reshape(-1, length, 2, imageSize, imageSize) batch_size = 10 result = np.zeros([input_data.shape[0], num_categories]) num_batches = int(math.ceil(float(input_data.shape[0]) / batch_size)) with torch.no_grad(): for bb in range(num_batches): span = range(batch_size * bb, min(input_data.shape[0], batch_size * (bb + 1))) input_data_batched = input_data[span, :, :, :, :] imgDataTensor = torch.from_numpy(input_data_batched).type( torch.FloatTensor).cuda() if 'rgb' in architecture_name or 'pose' in architecture_name: imgDataTensor = imgDataTensor.view(-1, length, 3, imageSize, imageSize).transpose(1, 2) elif 'flow' in architecture_name: imgDataTensor = imgDataTensor.view(-1, length, 2, imageSize, imageSize).transpose(1, 2) if 'bert' in architecture_name or 'pooling' in architecture_name or 'NLB' in architecture_name \ or 'lstm' in architecture_name or 'adamw' in architecture_name: output, input_vectors, sequenceOut, maskSample = net( imgDataTensor) else: output = net(imgDataTensor) #span = range(sample_size*bb, min(int(input_data.shape[0]/length),sample_size*(bb+1))) result[span, :] = output.data.cpu().numpy() mean_result = np.mean(result, 0) prediction = np.argmax(mean_result) top3 = mean_result.argsort()[::-1][:3] top5 = mean_result.argsort()[::-1][:5] return prediction, mean_result, top3
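A hedged example of scoring one frame directory with the 3D evaluator above. The architecture string only has to contain the tokens the function branches on ('rgb', 'I3D', image size, ...); the model constructor, checkpoint name and frame directory are placeholders rather than repository values.

import torch
import models  # project model zoo, as in the training scripts

arch = 'rgb_I3D64f'                                   # placeholder; only the substrings matter
net = models.__dict__[arch](num_classes=51)           # hypothetical constructor call
net.load_state_dict(torch.load('hmdb51_i3d.pth.tar')['state_dict'])
net = net.cuda().eval()

pred, scores, top3 = VideoSpatialPrediction3D('frames/brush_hair/april_09_01',
                                              net,
                                              num_categories=51,
                                              architecture_name=arch,
                                              length=64,
                                              ten_crop=False)
print(pred, top3)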
def VideoSpatialPrediction3D_bert( vid_name, net, num_categories, architecture_name, start_frame=0, num_frames=0, num_seg=4, length = 16, extension = 'img_{0:05d}.jpg', ten_crop = False ): if num_frames == 0: imglist = os.listdir(vid_name) newImageList=[] if 'rgb' in architecture_name or 'pose' in architecture_name: for item in imglist: if 'img' in item: newImageList.append(item) elif 'flow' in architecture_name: for item in imglist: if 'flow_x' in item: newImageList.append(item) duration = len(newImageList) else: duration = num_frames if 'rgb' in architecture_name: if 'I3D' in architecture_name: if not 'resnet' in architecture_name: clip_mean = [0.5, 0.5, 0.5] clip_std = [0.5, 0.5, 0.5] else: clip_mean = [0.45, 0.45, 0.45] clip_std = [0.225, 0.225, 0.225] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) if '112' in architecture_name: scale = 0.5 else: scale = 1 elif 'MFNET3D' in architecture_name: clip_mean = [0.48627451, 0.45882353, 0.40784314] clip_std = [0.234, 0.234, 0.234] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize]) if '112' in architecture_name: scale = 0.5 else: scale = 1 elif 'tsm' in architecture_name: clip_mean = [0.485, 0.456, 0.406] clip_std = [0.229, 0.224, 0.225] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize]) scale = 1 elif "r2plus1d" in architecture_name: clip_mean = [0.43216, 0.394666, 0.37645] clip_std = [0.22803, 0.22145, 0.216989] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize]) scale = 0.5 elif 'rep_flow' in architecture_name: clip_mean = [0.5, 0.5, 0.5] clip_std = [0.5, 0.5, 0.5] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) scale = 1 elif "slowfast" in architecture_name: clip_mean = [0.45, 0.45, 0.45] clip_std = [0.225, 0.225, 0.225] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) scale = 1 else: scale = 0.5 clip_mean = [114.7748, 107.7354, 99.4750] clip_std = [1, 1, 1] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor2(), normalize, ]) elif 'flow' in architecture_name: if 'I3D' in architecture_name: clip_mean = [0.5] * 2 clip_std = [0.5] * 2 normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) scale = 1 else: scale = 0.5 clip_mean = [127.5, 127.5] clip_std = [1, 1] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor2(), normalize, ]) # selection #step = int(math.floor((duration-1)/(num_samples-1))) if '224' in architecture_name: scale = 1 if '112' in architecture_name: scale = 0.5 imageSize=int(224 * scale) dims = (int(256 * scale),int(340 * scale),3,duration) duration = duration - 1 average_duration = int(duration / num_seg) offsetMainIndexes = [] offsets = [] for seg_id in range(num_seg): if average_duration >= length: offsetMainIndexes.append(int((average_duration 
- length + 1)/2 + seg_id * average_duration)) elif duration >=length: average_part_length = int(np.floor((duration-length)/num_seg)) offsetMainIndexes.append(int((average_part_length*(seg_id) + average_part_length*(seg_id+1))/2)) else: increase = int(duration / num_seg) offsetMainIndexes.append(0 + seg_id * increase) for mainOffsetValue in offsetMainIndexes: for lengthID in range(1, length+1): loaded_frame_index = lengthID + mainOffsetValue moded_loaded_frame_index = loaded_frame_index % (duration + 1) if moded_loaded_frame_index == 0: moded_loaded_frame_index = (duration + 1) offsets.append(moded_loaded_frame_index) imageList=[] imageList1=[] imageList2=[] imageList3=[] imageList4=[] imageList5=[] imageList6=[] imageList7=[] imageList8=[] imageList9=[] imageList10=[] imageList11=[] imageList12=[] interpolation = cv2.INTER_LINEAR for index in offsets: if 'rgb' in architecture_name or 'pose' in architecture_name: img_file = os.path.join(vid_name, extension.format(index)) img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED) img = cv2.resize(img, dims[1::-1],interpolation) #img2 = cv2.resize(img, dims2[1::-1],interpolation) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) img_flip = img[:,::-1,:].copy() elif 'flow' in architecture_name: flow_x_file = os.path.join(vid_name, extension.format('x',index)) flow_y_file = os.path.join(vid_name, extension.format('y',index)) img_x = cv2.imread(flow_x_file, cv2.IMREAD_GRAYSCALE) img_y = cv2.imread(flow_y_file, cv2.IMREAD_GRAYSCALE) img_x = np.expand_dims(img_x,-1) img_y = np.expand_dims(img_y,-1) img = np.concatenate((img_x,img_y),2) img = cv2.resize(img, dims[1::-1],interpolation) img_flip = img[:,::-1,:].copy() #img_flip2 = img2[:,::-1,:].copy() imageList1.append(img[int(16 * scale):int(16 * scale + imageSize), int(58 * scale) : int(58 * scale + imageSize), :]) imageList2.append(img[:imageSize, :imageSize, :]) imageList3.append(img[:imageSize, -imageSize:, :]) imageList4.append(img[-imageSize:, :imageSize, :]) imageList5.append(img[-imageSize:, -imageSize:, :]) imageList6.append(img_flip[int(16 * scale):int(16 * scale + imageSize), int(58 * scale) : int(58 * scale + imageSize), :]) imageList7.append(img_flip[:imageSize, :imageSize, :]) imageList8.append(img_flip[:imageSize, -imageSize:, :]) imageList9.append(img_flip[-imageSize:, :imageSize, :]) imageList10.append(img_flip[-imageSize:, -imageSize:, :]) # imageList11.append(img2) # imageList12.append(img_flip2) if ten_crop: imageList=imageList1+imageList2+imageList3+imageList4+imageList5+imageList6+imageList7+imageList8+imageList9+imageList10 else: imageList=imageList1 #imageList=imageList11+imageList12 rgb_list=[] for i in range(len(imageList)): cur_img = imageList[i] cur_img_tensor = val_transform(cur_img) rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0)) input_data=np.concatenate(rgb_list,axis=0) with torch.no_grad(): imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda() if 'rgb' in architecture_name or 'pose' in architecture_name: imgDataTensor = imgDataTensor.view(-1,length,3,imageSize,imageSize).transpose(1,2) elif 'flow' in architecture_name: imgDataTensor = imgDataTensor.view(-1,length,2,imageSize,imageSize).transpose(1,2) if 'bert' in architecture_name or 'pooling' in architecture_name: output, input_vectors, sequenceOut, maskSample = net(imgDataTensor) else: output = net(imgDataTensor) # outputSoftmax=soft(output) result = output.data.cpu().numpy() mean_result=np.mean(result,0) prediction=np.argmax(mean_result) top3 = mean_result.argsort()[::-1][:3] return prediction, 
mean_result, top3
def main(args): global best_prec1, best_loss input_size = int(224 * args.scale) width = int(340 * args.scale) height = int(256 * args.scale) if not os.path.exists(args.savelocation): os.makedirs(args.savelocation) now = time.time() savelocation = os.path.join(args.savelocation, str(now)) os.makedirs(savelocation) logging.basicConfig(filename=os.path.join(savelocation, "log.log"), level=logging.INFO) model = build_model(args.arch, args.pre, args.num_seg, args.resume) optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) criterion = nn.CrossEntropyLoss().cuda() criterion2 = nn.MSELoss().cuda() scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, verbose=True) # if args.dataset=='sign': # dataset="/data/AUTSL/train_img_c" # elif args.dataset=="signd": # dataset="/data/AUTSL/train_img_c" # elif args.dataset=="customd": # dataset="/data/AUTSL/train_img_c" # else: # print("no dataset") # return 0 cudnn.benchmark = True length = 64 scale_ratios = [1.0, 0.875, 0.75, 0.66] clip_mean = [0.43216, 0.394666, 0.37645] * args.num_seg * length clip_std = [0.22803, 0.22145, 0.216989] * args.num_seg * length normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) train_transform = video_transforms.Compose([ video_transforms.CenterCrop(input_size), video_transforms.ToTensor2(), normalize, ]) val_transform = video_transforms.Compose([ video_transforms.CenterCrop((input_size)), video_transforms.ToTensor2(), normalize, ]) # test_transform = video_transforms.Compose([ # video_transforms.CenterCrop((input_size)), # video_transforms.ToTensor2(), # normalize, # ]) # test_file = os.path.join(args.datasetpath, args.testlist) if not os.path.exists(args.trainlist) or not os.path.exists(args.vallist): print( "No split file exists in %s directory. Preprocess the dataset first" % (args.datasetpath)) train_dataset = datasets.__dict__[args.dataset]( root=args.datasetpath, source=args.trainlist, phase="train", modality="rgb", is_color=True, new_length=length, new_width=width, new_height=height, video_transform=train_transform, num_segments=args.num_seg) val_dataset = datasets.__dict__[args.dataset]( root=args.datasetpath, source=args.vallist, phase="val", modality="rgb", is_color=True, new_length=length, new_width=width, new_height=height, video_transform=val_transform, num_segments=args.num_seg) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) best_prec1 = 0 for epoch in range(0, args.epochs): train(length, input_size, train_loader, model, criterion, criterion2, optimizer, epoch) if (epoch + 1) % args.save_freq == 0: is_best = False prec1, prec3, lossClassification = validate( length, input_size, val_loader, model, criterion, criterion2) scheduler.step(lossClassification) if prec1 >= best_prec1: is_best = True best_prec1 = prec1 checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint.pth.tar") text = "save checkpoint {}".format(checkpoint_name) print(text) logging.info(text) save_checkpoint( { "epoch": epoch + 1, "arch": args.arch, "state_dict": model.state_dict(), "prec1": prec1, "optimizer": optimizer.state_dict() }, is_best, checkpoint_name, savelocation)
def main():
    global args
    args = parser.parse_args()

    if args.dataset == 'ucf101':
        num_categories = 101
    elif args.dataset == 'hmdb51':
        num_categories = 51
    elif args.dataset == 'kinetics':
        num_categories = 400
    else:
        raise ValueError('Unknown dataset ' + args.dataset)

    start_frame = 0

    model_start_time = time.time()
    params = torch.load(args.weights)  # hard code
    net = models.__dict__[args.arch](pretrained=False, num_classes=num_categories)
    net.load_state_dict(params['state_dict'])
    net.cuda()
    net.eval()
    model_end_time = time.time()
    model_time = model_end_time - model_start_time
    print("Action recognition model is loaded in %4.4f seconds." % (model_time))

    if args.modality == "rgb":
        new_length = 1
        is_color = True
        scale_ratios = [1.0, 0.875, 0.75, 0.66]
        clip_mean = [0.485, 0.456, 0.406] * new_length
        clip_std = [0.229, 0.224, 0.225] * new_length
    elif args.modality == "flow":
        new_length = 10
        is_color = False
        scale_ratios = [1.0, 0.875, 0.75]
        clip_mean = [0.5, 0.5] * new_length
        clip_std = [0.226, 0.226] * new_length

    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    val_transform = video_transforms.Compose([
        video_transforms.GroupCenterCrop(net.input_size),
        video_transforms.CenterCrop((224)),
        video_transforms.ToTensor(),
        normalize,
    ])

    dataset = datasets.load_clip(root=args.data,
                                 source=args.test_list,
                                 phase="val",
                                 modality=args.modality,
                                 is_color=is_color,
                                 new_length=new_length,
                                 new_width=args.new_width,
                                 new_height=args.new_height,
                                 video_transform=val_transform)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=1,
                                              shuffle=False,
                                              num_workers=args.workers * 2,
                                              pin_memory=True)

    data_gen = enumerate(data_loader)
    total_num = len(data_loader.dataset)
    output = []

    def eval_video(video_data):
        # Helper kept from the original script; its call site below is commented out.
        i, data, label = video_data
        num_crop = 1
        if args.modality == 'rgb':
            length = 3
        elif args.modality == 'flow':
            length = 10
        elif args.modality == 'RGBDiff':
            length = 18
        else:
            raise ValueError("Unknown modality " + args.modality)
        input_var = torch.autograd.Variable(
            data.view(-1, length, data.size(2), data.size(3)))
        input_var = input_var.type(torch.FloatTensor).cuda()
        rst = net(input_var).data.cpu().numpy().copy()
        return i, rst.reshape(
            (num_crop, args.test_segments, num_categories)).mean(axis=0).reshape(
                (args.test_segments, 1, num_categories)), label[0]

    proc_start_time = time.time()
    max_num = args.max_num if args.max_num > 0 else len(data_loader.dataset)
    match_count = 0  # was previously used without being initialized

    with torch.no_grad():  # replaces the deprecated volatile=True Variables
        for i, (data, label) in data_gen:
            data = data.float().cuda(non_blocking=True)  # `async` is a keyword in Python 3.7+
            label = label.cuda(non_blocking=True)
            input_var = torch.autograd.Variable(data)
            target_var = torch.autograd.Variable(label)
            rst = net(input_var).data.cpu().numpy()
            # avg_pred_fc8 = np.mean(rst, axis=1)
            # print(avg_spatial_pred_fc8.shape)
            # result_list.append(avg_pred_fc8)
            # avg_spatial_pred = softmax(avg_spatial_pred_fc8)
            pred_index = np.argmax(rst)
            # print(label.cpu().numpy())
            # print(pred_index)
            # print(rst)
            # if i >= max_num:
            #     break
            # rst = eval_video((i, data, label))
            output.append(rst)
            if label.cpu().numpy()[0] == pred_index:
                match_count += 1
            cnt_time = time.time() - proc_start_time
            print('video {} done, total {}/{}, average {} sec/video'.format(
                i, i + 1, total_num, float(cnt_time) / (i + 1)))

    print(match_count)
    print(total_num)
    print("Accuracy is %4.4f" % (float(match_count) / float(total_num)))
    np.save("{}_sX_{}_{}.npy".format(args.dataset, args.modality, args.arch),
            np.array(output))  # hard code
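The per-video logits saved by np.save at the end of main() can be reloaded later, for example for two-stream fusion. A small, hedged sketch: the file names follow the save pattern above but with placeholder architecture names, and the fusion weight is purely illustrative.

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

# Arrays have shape (num_videos, 1, num_classes), one row per test video.
rgb_scores = np.load('ucf101_sX_rgb_rgb_resnet152.npy')     # placeholder file names
flow_scores = np.load('ucf101_sX_flow_flow_resnet152.npy')
fused = softmax(rgb_scores) + 1.5 * softmax(flow_scores)    # illustrative fusion weight
pred = fused.squeeze(1).argmax(axis=1)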
def VideoSpatialPrediction(mode, vid_name, target, net, num_categories, start_frame=0, num_frames=0, num_samples=25, index=1, new_size=299, ext=".jpg"): gc = GradCAM(model=net) if num_frames == 0: imglist = os.listdir(vid_name) duration = len(imglist) else: duration = num_frames # selection if mode == 'rgb': step = int(math.floor((duration - 1) / (num_samples - 1))) clip_mean = [0.485, 0.456, 0.406] clip_std = [0.229, 0.224, 0.225] else: clip_mean = [0.5, 0.5] clip_std = [0.226, 0.226] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) test_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) # inception = 320,360, resnet = 240, 320 width = 320 if new_size == 299 else 240 height = 360 if new_size == 299 else 320 deep = 1 if mode == 'rhythm' else 3 dims = (width, height, deep, num_samples) rgb = np.zeros(shape=dims, dtype=np.float64) rgb_flip = np.zeros(shape=dims, dtype=np.float64) for i in range(num_samples): if mode == 'rhythm': img_file = os.path.join( vid_name, 'visual_rhythm_{0:05d}{1}'.format(index, ext)) #print(img_file) img = cv2.imread(img_file, cv2.IMREAD_GRAYSCALE) img = cv2.resize(img, dims[1::-1]) rgb[:, :, 0, i] = img rgb_flip[:, :, 0, i] = img[:, ::-1] else: img_file = os.path.join(vid_name, 'img_{0:05d}{1}'.format(i * step + 1, ext)) img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED) img = cv2.resize(img, dims[1::-1]) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) rgb[:, :, :, i] = img rgb_flip[:, :, :, i] = img[:, ::-1, :] # crop 299 = inception, 224 = resnet size = new_size corner = [(height - size) // 2, (width - size) // 2] rgb_1 = rgb[:size, :size, :, :] rgb_2 = rgb[:size, -size:, :, :] rgb_3 = rgb[corner[1]:corner[1] + size, corner[0]:corner[0] + size, :, :] rgb_4 = rgb[-size:, :size, :, :] rgb_5 = rgb[-size:, -size:, :, :] rgb_f_1 = rgb_flip[:size, :size, :, :] rgb_f_2 = rgb_flip[:size, -size:, :, :] rgb_f_3 = rgb_flip[corner[1]:corner[1] + size, corner[0]:corner[0] + size, :, :] rgb_f_4 = rgb_flip[-size:, :size, :, :] rgb_f_5 = rgb_flip[-size:, -size:, :, :] rgb = np.concatenate((rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_f_1, rgb_f_2, rgb_f_3, rgb_f_4, rgb_f_5), axis=3) _, _, _, c = rgb.shape rgb_list = [] for c_index in range(c): cur_img = rgb[:, :, :, c_index] cur_img_tensor = test_transform(cur_img) rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0)) rgb_np = np.concatenate(rgb_list, axis=0) prediction = np.zeros((num_categories, rgb.shape[3])) index = 50 input_data = rgb_np[index:index + 1, :, :, :] imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda() imgDataVar = torch.autograd.Variable(imgDataTensor) probs, ids = gc.forward(imgDataVar) ids_ = torch.LongTensor([[target]] * len(imgDataVar)).to( torch.device("cuda")) gc.backward(ids=ids_) regions = gc.generate(target_layer="Mixed_7c") save_gradcam(vid_name.split("/")[-1] + ".png", gcam=regions[0, 0], raw_image=rgb[:, :, :, index]) return prediction
# define hyperparameters
batch_size = 1
eval_freq = 1

# define transform
import video_transforms

scale = 1
input_size = int(224 * scale)
width = int(340 * scale)
height = int(256 * scale)
length = 32
clip_mean = [0.5, 0.5, 0.5] * length
clip_std = [0.5, 0.5, 0.5] * length
scale_ratios = [1.0, 0.875, 0.75, 0.66]
normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)

train_transform = video_transforms.Compose([
    video_transforms.MultiScaleCrop((input_size, input_size), scale_ratios),
    video_transforms.RandomHorizontalFlip(),
    video_transforms.ToTensor(),
    normalize,
])
val_transform = video_transforms.Compose([
    video_transforms.CenterCrop((input_size)),
    video_transforms.ToTensor(),
    normalize,
])

# define dataset and dataloader
train_split_file = "./datasets/settings/haa500_instruments/train_rgb_split1.txt"
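A hedged sketch of how these transforms and the split file would plug into a loader. The dataset name and keyword arguments follow the pattern used by the training scripts elsewhere in this repo, but the 'haa500' class name and the root path are placeholders.

import torch
import datasets  # project dataset package, as used by the training scripts

train_dataset = datasets.__dict__['haa500'](root='./datasets/haa500_frames',  # placeholder
                                            source=train_split_file,
                                            phase='train',
                                            modality='rgb',
                                            is_color=True,
                                            new_length=length,
                                            new_width=width,
                                            new_height=height,
                                            video_transform=train_transform)
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           num_workers=4,
                                           pin_memory=True)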
def VideoTemporalPrediction( mode, vid_name, net, num_categories, start_frame=0, num_frames=0, num_samples=25, optical_flow_frames=10, new_size = 299, ext = ".jpg" ): if num_frames == 0: imglist = os.listdir(vid_name) duration = len(imglist) else: duration = num_frames # selection step = int(math.floor((duration-optical_flow_frames+1)/num_samples)) clip_mean = [0.5] * optical_flow_frames * 2 clip_std = [0.226] * optical_flow_frames * 2 normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) test_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize ]) # inception = 320,360, resnet = 240, 320 width = 320 if new_size==299 else 240 height = 360 if new_size==299 else 320 deep = optical_flow_frames*2 dims = (width,height,deep,num_samples) flow = np.zeros(shape=dims, dtype=np.float64) flow_flip = np.zeros(shape=dims, dtype=np.float64) for i in range(num_samples): for j in range(optical_flow_frames): flow_x_file = os.path.join(vid_name, mode+'_x_{0:05d}{1}'.format(i*step+j+1 + start_frame, ext)) flow_y_file = os.path.join(vid_name, mode+'_y_{0:05d}{1}'.format(i*step+j+1 + start_frame, ext)) img_x = cv2.imread(flow_x_file, cv2.IMREAD_GRAYSCALE) img_y = cv2.imread(flow_y_file, cv2.IMREAD_GRAYSCALE) img_x = cv2.resize(img_x, dims[1::-1]) img_y = cv2.resize(img_y, dims[1::-1]) flow[:,:,j*2 ,i] = img_x flow[:,:,j*2+1,i] = img_y flow_flip[:,:,j*2 ,i] = 255 - img_x[:, ::-1] flow_flip[:,:,j*2+1,i] = img_y[:, ::-1] # crop 299 = inception, 224 = resnet size = new_size corner = [(height-size)//2, (width-size)//2] flow_1 = flow[:size, :size, :,:] flow_2 = flow[:size, -size:, :,:] flow_3 = flow[corner[1]:corner[1]+size, corner[0]:corner[0]+size, :,:] flow_4 = flow[-size:, :size, :,:] flow_5 = flow[-size:, -size:, :,:] flow_f_1 = flow_flip[:size, :size, :,:] flow_f_2 = flow_flip[:size, -size:, :,:] flow_f_3 = flow_flip[corner[1]:corner[1]+size, corner[0]:corner[0]+size, :,:] flow_f_4 = flow_flip[-size:, :size, :,:] flow_f_5 = flow_flip[-size:, -size:, :,:] flow = np.concatenate((flow_1,flow_2,flow_3,flow_4,flow_5,flow_f_1,flow_f_2,flow_f_3,flow_f_4,flow_f_5), axis=3) _, _, _, c = flow.shape flow_list = [] for c_index in range(c): cur_img = flow[:,:,:,c_index].squeeze() cur_img_tensor = test_transform(cur_img) flow_list.append(np.expand_dims(cur_img_tensor.numpy(), 0)) flow_np = np.concatenate(flow_list,axis=0) batch_size = 15 prediction = np.zeros((num_categories,flow.shape[3])) num_batches = int(math.ceil(float(flow.shape[3])/batch_size)) for bb in range(num_batches): span = range(batch_size*bb, min(flow.shape[3],batch_size*(bb+1))) input_data = flow_np[span,:,:,:] imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda() imgDataVar = torch.autograd.Variable(imgDataTensor) output = net(imgDataVar) result = output.data.cpu().numpy() prediction[:, span] = np.transpose(result) return prediction
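A hedged call into the temporal (optical-flow) evaluator above. The flow directory, checkpoint and architecture name are placeholders; the returned (num_categories, 250) score matrix is averaged over crops and samples the same way as the spatial scores.

import numpy as np
import torch
import models  # project model zoo

net = models.__dict__['flow_resnet152'](pretrained=False, num_classes=101)  # placeholder arch
net.load_state_dict(torch.load('flow_checkpoint.pth.tar')['state_dict'])
net.cuda().eval()

flow_scores = VideoTemporalPrediction('flow',                              # files named flow_x_*, flow_y_*
                                      'flows/v_ApplyEyeMakeup_g01_c01',
                                      net,
                                      num_categories=101,
                                      new_size=224)
print(np.argmax(flow_scores.mean(axis=1)))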
def main(): global best_prec1 # create model print("Building model ... ") print("Building model ... ", file = f_log) model = build_model(resume_path = args.resume) print("Model %s is loaded. " % (args.arch)) print("Model %s is loaded. " % (args.arch), file = f_log) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) cudnn.benchmark = True # Data transforming if args.modality == "rgb": is_color = True scale_ratios = [1.0, 0.875, 0.75, 0.66] clip_mean = [0.485, 0.456, 0.406] clip_std = [0.229, 0.224, 0.225] elif args.modality == "tvl1_flow" or args.modality == "lk_flow": is_color = False scale_ratios = [1.0, 0.875, 0.75] clip_mean = [0.5, 0.5] clip_std = [0.226, 0.226] else: print("No such modality. Only rgb and flow supported.") print("No such modality. Only rgb and flow supported.", file = f_log) normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) train_transform = video_transforms.Compose([ #video_transforms.Scale((288)), video_transforms.MultiScaleCrop((256, 256), scale_ratios), video_transforms.RandomHorizontalFlip(), video_transforms.ToTensor(), normalize, ]) val_transform = video_transforms.Compose([ #video_transforms.Scale((288)), video_transforms.CenterCrop((256)), video_transforms.ToTensor(), normalize, ]) # data loading train_setting_file = "train_%s_split%d.txt" % (args.modality, args.split) train_split_file = os.path.join(args.settings, args.dataset, train_setting_file) val_setting_file = "val_%s_split%d.txt" % (args.modality, args.split) val_split_file = os.path.join(args.settings, args.dataset, val_setting_file) if not os.path.exists(train_split_file) or not os.path.exists(val_split_file): print("No split file exists in %s directory. Preprocess the dataset first" % (args.settings)) print("No split file exists in %s directory. 
Preprocess the dataset first" % (args.settings), file = f_log ) train_dataset = datasets.__dict__[args.dataset](setting=train_split_file, root=args.data, train=True, new_width=args.new_width, new_height=args.new_height, new_length=args.new_length, target_width=args.new_width, target_height=args.new_height, modality=args.modality, num_segments=args.num_segments, transform=train_transform, name_pattern='frame%06d.jpg') val_dataset = datasets.__dict__[args.dataset](setting=val_split_file, root=args.data, train=False, new_width=args.new_width, new_height=args.new_height, new_length=args.new_length, target_width=args.new_width, target_height=args.new_height, modality=args.modality, num_segments=args.num_segments, transform=val_transform, name_pattern='frame%06d.jpg') print('{} samples found, {} train samples and {} test samples.'.format(len(val_dataset)+len(train_dataset), len(train_dataset), len(val_dataset))) print('{} samples found, {} train samples and {} test samples.'.format(len(val_dataset)+len(train_dataset), len(train_dataset), len(val_dataset)), file = f_log ) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers) #, pin_memory=True) val_loader = torch.utils.data.DataLoader( val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers) #, pin_memory=True) if args.evaluate: validate(val_loader, model, criterion) return for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) # train for one epoch print("start epoch ", epoch) train(train_loader, model, criterion, optimizer, epoch) # evaluate on validation set prec1 = 0.0 if (epoch + 1) % args.save_freq == 0: prec1 = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) if (epoch + 1) % args.save_freq == 0: checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint.pth.tar") save_checkpoint({ 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer' : optimizer.state_dict(), }, is_best, checkpoint_name, args.save_path)
def rgb_test(param_model): model = param_model f = open(video + "rgb_result.txt", 'w') video_list = os.listdir(video) for file in video_list: if file.endswith("mp4"): f.write(file + "\n") frame_count = 2 clip_mean = [0.485, 0.456, 0.406] * 1 clip_std = [0.229, 0.224, 0.225] * 1 normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) # config the transform to match the network's format transform = video_transforms.Compose([ # video_transforms.Scale((256)), video_transforms.CenterCrop((224)), video_transforms.ToTensor(), normalize, ]) # prepare the translation dictionary label-action data_handler = UCF101_splitter( os.getcwd() + '/datasets/ucf101_splits/', None) data_handler.get_action_index() class_to_idx = data_handler.action_label idx_to_class = {v: k for k, v in class_to_idx.items()} # Start looping on frames received from webcam vs = cv2.VideoCapture(video + file) softmax = torch.nn.Softmax() nn_output = torch.tensor(np.zeros((1, 23)), dtype=torch.float32).cuda() sampled_list = [] first_count = 0 while True: # read each frame and prepare it for feedforward in nn (resize and type) ret, orig_frame = vs.read() if ret is False: break else: orig_frame = cv2.resize(orig_frame, (342, 256), interpolation=cv2.INTER_LINEAR) frame = cv2.cvtColor(orig_frame, cv2.COLOR_BGR2RGB) frame = transform(frame).view(1, 3, 224, 224).cuda() frame = frame.float().cuda(async=True) # feed the frame to the neural network nn_output = model(frame) # vote for class with 25 consecutive frames if frame_count % 10 == 0: nn_output = softmax(nn_output) nn_output = nn_output.data.cpu().numpy() preds = nn_output.argsort()[0][-5:][::-1] pred_classes = [(idx_to_class[str(pred + 1)], nn_output[0, pred]) for pred in preds] red = (0, 0, 255) green = (0, 255, 0) blue = (255, 0, 0) white = (255, 255, 255) yellow = (0, 255, 255) cyan = (255, 255, 0) magenta = (255, 0, 255) thickness = 2 center_x = int(342 / 2.0) center_y = int(256 / 2.0) location = (center_x - 170, center_y + 80) fontScale = 1.5 font = cv2.FONT_HERSHEY_SIMPLEX y0, dy = 180, 20 value = 0 for i in range(5): y = y0 + i * dy if pred_classes[i][0] == label: value = pred_classes[i][1] # reset the process f.write(str(value) + "\n") nn_output = torch.tensor(np.zeros((1, 23)), dtype=torch.float32).cuda() # cv2.imwrite("temp/" + str(frame_count) + ".jpg", orig_frame) # Display the resulting frame and the classified action frame_count += 1 # When everything done, release the capture f.write("----\n") vs.release() f.close()
def main(): global args, prec_list prec_list = [] args = parser.parse_args() full_path = logging(args) print(args.modality + " network trained with the split " + str(args.split) + ".") # create model print("Building model ... ") exits_model, model = build_model(int(args.start_epoch), args.pretrain_weights) if not exits_model: return else: print("Model %s is loaded. " % (args.arch)) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) cudnn.benchmark = True # Data transforming if args.modality == "rgb" or args.modality == "rgb2": is_color = True scale_ratios = [1.0, 0.875, 0.75, 0.66] clip_mean = [0.485, 0.456, 0.406] * args.new_length clip_std = [0.229, 0.224, 0.225] * args.new_length elif args.modality == "flow" or args.modality == "rhythm": is_color = False scale_ratios = [1.0, 0.875, 0.75] clip_mean = [0.5, 0.5] * args.new_length clip_std = [0.226, 0.226] * args.new_length else: print("No such modality. Only rgb and flow supported.") new_size = 299 if args.arch.find("inception_v3") > 0 else 224 normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) train_transform = video_transforms.Compose([ #video_transforms.Scale((256)), video_transforms.MultiScaleCrop((new_size, new_size), scale_ratios), video_transforms.RandomHorizontalFlip(), video_transforms.ToTensor(), normalize, ]) if args.es: val_transform = video_transforms.Compose([ # video_transforms.Scale((256)), video_transforms.CenterCrop((new_size)), video_transforms.ToTensor(), normalize, ]) modality_ = "rgb" if (args.modality == "rhythm" or args.modality[:3] == "rgb") else "flow" if args.modality == "rgb2": createNewDataset("train_split%d.txt", "new_train.txt", modality_) #createNewDataset("val_%s_split%d.txt", "new_val.txt",modality_) # data loading train_setting_file = "new_train.txt" if args.modality == "rgb2" else "train_split%d.txt" % ( args.split) train_split_file = os.path.join(args.settings, args.dataset, train_setting_file) if not os.path.exists( train_split_file): # or not os.path.exists(val_split_file): print( "No split file exists in %s directory. Preprocess the dataset first" % (args.settings)) extension = ".png" if args.dataset == "hmdb51" and args.modality == "rhythm" else ".jpg" direction_file = "direction.txt" if args.vr_approach == 3 else "direction_video.txt" direction_path = os.path.join(args.settings, args.dataset, direction_file) train_dataset = datasets.__dict__['dataset']( root=args.data, source=train_split_file, phase="train", modality=args.modality, is_color=is_color, new_length=args.new_length, new_width=args.new_width, new_height=args.new_height, video_transform=train_transform, approach_VR=args.vr_approach, extension=extension, direction_path=direction_path) if args.es: val_setting_file = "val_split%d.txt" % (args.split) val_split_file = os.path.join(args.settings, args.dataset, val_setting_file) if not os.path.exists(val_split_file): print( "No split file exists in %s directory. Preprocess the dataset first" % (args.settings)) val_dataset = datasets.__dict__['dataset']( root=args.data, source=val_split_file, phase="val", modality=args.modality, is_color=is_color, new_length=args.new_length, new_width=args.new_width, new_height=args.new_height, video_transform=val_transform, approach_VR=args.vr_approach, extension=extension, direction_path=direction_path) print('{} samples found, {} train samples and {} validation samples.'. 
format( len(val_dataset) + len(train_dataset), len(train_dataset), len(val_dataset))) else: print('{} train samples found.'.format(len(train_dataset))) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) if args.es: val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) early_stop = EarlyStopping(verbose=True, log_path=os.path.join( full_path, "early_stopping.json")) is_best = False for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) if args.es: # evaluate on validation set losses = validate(val_loader, model, criterion) is_best = early_stop(losses.avg, epoch) if (epoch + 1) % args.save_freq == 0 or is_best: checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint_" + args.modality + "_split_" + str(args.split) + ".pth.tar") es_val = float('inf') if not args.es else early_stop.val_loss_min save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'val_loss_min': es_val }, is_best, checkpoint_name, os.path.join(full_path, "checkpoints")) prec_name = "%03d_%s" % (epoch + 1, "prec_split_" + str(args.split) + ".txt") save_precision(prec_name, os.path.join(full_path, "precision")) if args.es and early_stop.early_stop: break if not args.es: # Final model checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint_" + args.modality + "_split_" + str(args.split) + ".pth.tar") save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'val_loss_min': float('inf') }, True, checkpoint_name, os.path.join(full_path, "checkpoints"))
def main(): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") args = parser.parse_args() set_logger(log_file=args.log_file, debug_mode=args.debug_mode) torch.manual_seed(args.random_seed) torch.cuda.manual_seed(args.random_seed) cudnn.benchmark = True mean = [124 / 255, 117 / 255, 104 / 255] std = [1 / (.0167 * 255)] * 3 normalize = transforms.Normalize(mean=mean, std=std) train_loader = VideoIterTrain( dataset_path=args.dataset_path, annotation_path=args.annotation_path, clip_length=args.clip_length, frame_interval=args.train_frame_interval, video_transform=transforms.Compose([ transforms.Resize((256, 256)), transforms.RandomCrop((224, 224)), transforms.ToTensor(), normalize, ]), name='train', return_item_subpath=False, ) train_iter = torch.utils.data.DataLoader( train_loader, batch_size=args.batch_size, shuffle=False, num_workers=32, # 4, # change this part accordingly pin_memory=True) val_loader = VideoIterTrain( dataset_path=args.dataset_path, annotation_path=args.annotation_path_test, clip_length=args.clip_length, frame_interval=args.val_frame_interval, video_transform=transforms.Compose([ transforms.Resize((256, 256)), transforms.RandomCrop((224, 224)), transforms.ToTensor(), normalize, ]), name='val', return_item_subpath=False, ) val_iter = torch.utils.data.DataLoader( val_loader, batch_size=args.batch_size, shuffle=False, num_workers=32, # 4, # change this part accordingly pin_memory=True) network = C3D(pretrained=args.pretrained_3d) network.to(device) if not path.exists(features_dir): mkdir(features_dir) features_writer = FeaturesWriter() for i_batch, (data, target, sampled_idx, dirs, vid_names) in tqdm(enumerate(train_iter)): data = data.to(device) with torch.no_grad(): input_var = torch.autograd.Variable(data) outputs = network(input_var) for i, (dir, vid_name, start_frame) in enumerate( zip(dirs, vid_names, sampled_idx.cpu().numpy())): dir = path.join(features_dir, dir) features_writer.write(feature=outputs[i], video_name=vid_name, start_frame=start_frame, dir=dir) features_writer.dump() features_writer = FeaturesWriter() for i_batch, (data, target, sampled_idx, dirs, vid_names) in tqdm(enumerate(val_iter)): data = data.to(device) with torch.no_grad(): input_var = torch.autograd.Variable(data) outputs = network(input_var) for i, (dir, vid_name, start_frame) in enumerate( zip(dirs, vid_names, sampled_idx.cpu().numpy())): dir = path.join(features_dir, dir) features_writer.write(feature=outputs[i], video_name=vid_name, start_frame=start_frame, dir=dir) features_writer.dump()
def main(): global args, prec_list prec_list = [] args = parser.parse_args() full_path = logging(args) print("Network trained whith the split " + str(args.split) + ".") # create model print("Building model ... ") exits_model, model = build_model(int(args.start_epoch)) if not exits_model: return else: print("Model %s is loaded. " % (args.arch)) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) cudnn.benchmark = True # Data transforming is_color = False scale_ratios = [1.0, 0.875, 0.75] clip_mean = [0.5] * args.n_images clip_std = [0.226] * args.n_images new_size = 299 if args.arch.find("inception_v3") > 0 else 224 normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) train_transform = video_transforms.Compose([ video_transforms.RandomHorizontalFlip(), video_transforms.ToTensor(), normalize, ]) if args.es: val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) # data loading train_setting_file = "train_split%d.txt" % (args.split) train_split_file = os.path.join(args.settings, args.dataset, train_setting_file) if not os.path.exists(train_split_file): print( "No split file exists in %s directory. Preprocess the dataset first" % (args.settings)) train_dataset = dataset.__dict__['dataset']( root=args.data, source=train_split_file, phase="train", is_color=is_color, n_images=args.n_images, new_width=args.new_width, new_height=args.new_height, video_transform=train_transform) if args.es: val_setting_file = "val_split%d.txt" % (args.split) val_split_file = os.path.join(args.settings, args.dataset, val_setting_file) if not os.path.exists(val_split_file): print( "No split file exists in %s directory. 
Preprocess the dataset first" % (args.settings)) val_dataset = dataset.__dict__['dataset']( root=args.data, source=val_split_file, phase="val", is_color=is_color, n_images=args.n_images, new_width=args.new_width, new_height=args.new_height, video_transform=val_transform) print('{} samples found, {} train samples and {} test samples.'.format( len(val_dataset) + len(train_dataset), len(train_dataset), len(val_dataset))) else: print('{} train samples found.'.format(len(train_dataset))) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) if args.es: val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) early_stop = EarlyStopping(verbose=True, log_path=os.path.join( full_path, "early_stopping.json")) is_best = False for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) if args.es: # evaluate on validation set losses = validate(val_loader, model, criterion) is_best = early_stop(losses.avg, epoch) if (epoch + 1) % args.save_freq == 0 or is_best: checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint_rhythm_split_" + str(args.split) + ".pth.tar") es_val = float('inf') if not args.es else early_stop.val_loss_min save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'val_loss_min': es_val }, is_best, checkpoint_name, os.path.join(full_path, "checkpoints")) prec_name = "%03d_%s" % (epoch + 1, "prec_split_" + str(args.split) + ".txt") save_precision(prec_name, os.path.join(full_path, "precision")) if args.es and early_stop.early_stop: break if not args.es: # Final model checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint_rhythm_split_" + str(args.split) + ".pth.tar") save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'val_loss_min': float('inf') }, True, checkpoint_name, os.path.join(full_path, "checkpoints"))
def main(): global args, best_acc1 args = parser.parse_args() num_classes = args.num_classes start_epoch=0 writer = SummaryWriter(args.logdir) model = build_model(num_classes=num_classes, input_length=args.new_length) print(model) # create model print("Building model ... ") model = torch.nn.DataParallel(model) model.cuda() if not os.path.exists(args.out_dir): os.makedirs(args.out_dir) print("Saving everything to directory %s." % (args.out_dir)) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = ReduceLROnPlateau(optimizer, verbose=True, patience=4) # if resume set to True, load the model and continue training if args.resume or args.evaluate: if os.path.isfile(args.model_path): model, optimizer, start_epoch = load_checkpoint(model, optimizer, args.model_path) cudnn.benchmark = True is_color = True # scale_ratios = [1.0, 0.875, 0.75, 0.66] clip_mean = {'rgb': [0.485, 0.456, 0.406] * args.new_length, 'flow': [0.9432, 0.9359, 0.9511] *args.new_length, 'skeleton': [0.0071, 0.0078, 0.0079]*args.new_length} clip_std = {'rgb': [0.229, 0.224, 0.225] * args.new_length, 'flow': [0.0788, 0.0753, 0.0683] * args.new_length, 'skeleton': [0.0581, 0.0621, 0.0623] * args.new_length} normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) train_transform = video_transforms.Compose([ video_transforms.Resize((args.new_width, args.new_height)), video_transforms.ToTensor(), normalize, ]) val_transform = video_transforms.Compose([ video_transforms.Resize((args.new_width, args.new_height)), video_transforms.ToTensor(), normalize, ]) train_dataset = datasets.__dict__[args.dataset](root=args.data, source=args.train_split_file, phase="train", is_color=is_color, new_length=args.new_length, video_transform=train_transform) val_dataset = datasets.__dict__[args.dataset](root=args.data, source=args.test_split_file, phase="val", is_color=is_color, new_length=args.new_length, video_transform=val_transform, return_id=True) print('{} samples found, {} train samples and {} test samples.'.format(len(val_dataset)+len(train_dataset), len(train_dataset), len(val_dataset))) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader( val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) if args.evaluate: validate(val_loader, model, criterion, epoch=0, writer=writer, classes=val_dataset.classes) return for epoch in range(start_epoch, args.epochs): # train for one epoch train(train_loader, model, criterion, optimizer, epoch, args, writer) # evaluate on validation set acc1, loss = validate(val_loader, model, criterion, epoch, writer) scheduler.step(loss, epoch=epoch) # remember best acc@1 and save checkpoint is_best = acc1 > best_acc1 best_acc1 = max(acc1, best_acc1) save_checkpoint({ 'epoch': epoch + 1, 'arch': 'ThreeStreamTemporal', 'state_dict': model.state_dict(), 'best_acc1': best_acc1, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), }, is_best, 'last_checkpoint.pth.tar', args.out_dir) writer.close()
import numpy as np
import torch  # needed for torch.utils.data.DataLoader below
from PIL import Image
from tsn_dataset import TSNDataSet
from p3d_model import P3D199, get_optim_policies
import video_transforms
from tsn_models import TSN
from torch.nn.utils import clip_grad_norm

# Validation transform for the P3D/TSN evaluation: resize, center-crop to
# 160x160, convert to tensor and normalize with ImageNet statistics.
val_transform = video_transforms.Compose([
    video_transforms.Resize((182, 242)),
    video_transforms.CenterCrop(160),
    video_transforms.ToTensor(),
    video_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

val_loader = torch.utils.data.DataLoader(
    TSNDataSet("", "tsntest_01.lst",
               num_segments=2,
               new_length=16,
               modality="RGB",
               image_tmpl="frame{:06d}.jpg",
               transform=val_transform,
               random_shift=False),
    batch_size=1,
    shuffle=False,
    num_workers=1,
    pin_memory=False)
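# Usage sketch for the loader above: iterate the TSN clips and run a model in
# eval mode. `net` is a placeholder for whichever P3D/TSN model is built
# elsewhere in the project (its constructor is not shown here), and the
# assumption that the dataset yields (clip, label) pairs comes from how
# similar loaders are consumed in the training scripts below.
def run_validation_sketch(net, loader):
    net.eval()
    with torch.no_grad():
        for clips, labels in loader:
            logits = net(clips.cuda())  # clip tensor layout depends on TSNDataSet
            top1 = logits.argmax(dim=1)
            print(top1.cpu().tolist(), labels.tolist())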
def flow_test(param_model):
    # Runs the temporal (optical-flow) stream over every .mp4 in the global
    # `video` directory and, for each stack of 10 flow pairs, writes the
    # probability assigned to the global `label` (0 if it is not in the top-5).
    model = param_model
    video_list = os.listdir(video)
    f = open(video + "flow_result.txt", 'w')
    for file in video_list:
        if file.endswith("mp4"):
            f.write(file + "\n")
            # file_data = OrderedDict()
            frame_count = 0
            clip_mean = [0.5, 0.5] * 10
            clip_std = [0.226, 0.226] * 10
            normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
            # config the transform to match the network's format
            transform = video_transforms.Compose([
                # video_transforms.Scale((256)),
                video_transforms.CenterCrop((224)),
                video_transforms.ToTensor(),
                normalize,
            ])
            # prepare the translation dictionary label-action
            data_handler = UCF101_splitter(
                os.getcwd() + '/datasets/ucf101_splits/', None)
            data_handler.get_action_index()
            class_to_idx = data_handler.action_label
            idx_to_class = {v: k for k, v in class_to_idx.items()}
            # Start looping on frames read from the video file
            vs = cv2.VideoCapture(video + file)
            softmax = torch.nn.Softmax(dim=1)
            nn_output = torch.FloatTensor(2 * 10, 224, 224)
            count = 0
            idx = 0
            temp = ''
            x = []
            sampled_list = []
            while vs.isOpened():
                ret, image = vs.read()
                if ret is False:
                    break
                image = cv2.resize(image, (342, 256), interpolation=cv2.INTER_LINEAR)
                x.append(temp)
                if count == 11:
                    # 10 flow pairs (20 channels) collected: run the network
                    sampled_list = []
                    # input_var = torch.autograd.Variable(clip_input, volatile=True)
                    temp = ''
                    input_var = clip_input.view(1, 20, 224, 224).cuda()
                    output = model(input_var)
                    output = softmax(output)
                    output = output.data.cpu().numpy()
                    preds = output.argsort()[0][-5:][::-1]
                    pred_classes = [(idx_to_class[str(pred + 1)], output[0, pred])
                                    for pred in preds]
                    value = 0
                    for i in range(5):
                        if pred_classes[i][0] == label:
                            value = pred_classes[i][1]
                        temp += '{} - {:.2f}\n'.format(pred_classes[i][0],
                                                       pred_classes[i][1])
                    f.write(str(value) + "\n")
                    nn_output = torch.FloatTensor(2 * 10, 224, 224)
                    count = 1
                if count == 0:
                    old_frame = image.copy()
                    prev = cv2.cvtColor(old_frame, cv2.COLOR_BGR2GRAY)
                else:
                    next = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
                    flow = cv2.calcOpticalFlowFarneback(
                        prev, next, None, 0.5, 3, 15, 3, 5, 1.2, 0)
                    horz = cv2.normalize(flow[..., 0], None, 0, 255, cv2.NORM_MINMAX)
                    vert = cv2.normalize(flow[..., 1], None, 0, 255, cv2.NORM_MINMAX)
                    horz = horz.astype('uint8')
                    vert = vert.astype('uint8')
                    imgH = Image.fromarray(horz)
                    imgV = Image.fromarray(vert)
                    sampled_list.append(np.expand_dims(imgH, 2))
                    sampled_list.append(np.expand_dims(imgV, 2))
                    clip_input = np.concatenate(sampled_list, axis=2)
                    clip_input = transform(clip_input)
                    # `async` is a reserved word in Python 3.7+; use non_blocking
                    clip_input = clip_input.float().cuda(non_blocking=True)
                    imgH.close()
                    imgV.close()
                    prev = next.copy()
                count += 1
                idx += 1
            f.write("----\n")
            # file_data[file] = flow_value
            # with open('flow.json', 'w', encoding="utf-8") as make_file:
            #     json.dump(file_data, make_file, ensure_ascii=False, indent="\t")
            print(idx)
            vs.release()
    f.close()
def VideoSpatialPrediction(vid_name, net, num_categories, start_frame=0, num_frames=0, num_samples=25): if num_frames == 0: imglist = os.listdir(vid_name) duration = len(imglist) # print(duration) else: duration = num_frames clip_mean = [0.485, 0.456, 0.406] clip_std = [0.229, 0.224, 0.225] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) # selection step = int(math.floor((duration - 1) / (num_samples - 1))) dims = (256, 340, 3, num_samples) rgb = np.zeros(shape=dims, dtype=np.float64) rgb_flip = np.zeros(shape=dims, dtype=np.float64) for i in range(num_samples): img_file = os.path.join(vid_name, 'frame{0:06d}.jpg'.format(i * step + 1)) img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED) img = cv2.resize(img, dims[1::-1]) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) rgb[:, :, :, i] = img rgb_flip[:, :, :, i] = img[:, ::-1, :] # crop rgb_1 = rgb[:224, :224, :, :] rgb_2 = rgb[:224, -224:, :, :] rgb_3 = rgb[16:240, 60:284, :, :] rgb_4 = rgb[-224:, :224, :, :] rgb_5 = rgb[-224:, -224:, :, :] rgb_f_1 = rgb_flip[:224, :224, :, :] rgb_f_2 = rgb_flip[:224, -224:, :, :] rgb_f_3 = rgb_flip[16:240, 60:284, :, :] rgb_f_4 = rgb_flip[-224:, :224, :, :] rgb_f_5 = rgb_flip[-224:, -224:, :, :] rgb = np.concatenate((rgb_1, rgb_2, rgb_3, rgb_4, rgb_5, rgb_f_1, rgb_f_2, rgb_f_3, rgb_f_4, rgb_f_5), axis=3) _, _, _, c = rgb.shape rgb_list = [] for c_index in range(c): cur_img = rgb[:, :, :, c_index].squeeze() cur_img_tensor = val_transform(cur_img) rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0)) rgb_np = np.concatenate(rgb_list, axis=0) # print(rgb_np.shape) batch_size = 25 prediction = np.zeros((num_categories, rgb.shape[3])) num_batches = int(math.ceil(float(rgb.shape[3]) / batch_size)) for bb in range(num_batches): span = range(batch_size * bb, min(rgb.shape[3], batch_size * (bb + 1))) input_data = rgb_np[span, :, :, :] imgDataTensor = torch.from_numpy(input_data).type( torch.FloatTensor).cuda() imgDataVar = torch.autograd.Variable(imgDataTensor) output = net(imgDataVar) result = output.data.cpu().numpy() prediction[:, span] = np.transpose(result) return prediction
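# The matrix returned by VideoSpatialPrediction above holds one column of class
# scores per sampled view (num_samples frames x 10 crops/flips). A common way
# to turn it into a single video-level decision is to average over columns and
# take the argmax. A small sketch, assuming plain mean pooling is acceptable:
import numpy as np


def video_level_prediction(prediction):
    # prediction: (num_categories, num_views) array of per-view scores
    avg_scores = prediction.mean(axis=1)
    return int(np.argmax(avg_scores)), avg_scores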
def VideoTemporalPrediction(vid_name, net, num_categories, start_frame=0, num_frames=0, num_samples=5, optical_flow_frames=25): if num_frames == 0: # print(vid_name) imglist = glob.glob(os.path.join(vid_name, '*flow_x*.jpg')) duration = len(imglist) else: duration = num_frames clip_mean = [0.5] * 20 clip_std = [0.226] * 20 normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) # selection step = int(math.floor((duration - optical_flow_frames + 1) / num_samples)) dims = (256, 340, optical_flow_frames * 2, num_samples) flow = np.zeros(shape=dims, dtype=np.float64) flow_flip = np.zeros(shape=dims, dtype=np.float64) for i in range(num_samples): for j in range(optical_flow_frames): flow_x_file = os.path.join( vid_name, 'flow_x_{0:04d}.jpg'.format(i * step + j + 1 + start_frame)) flow_y_file = os.path.join( vid_name, 'flow_y_{0:04d}.jpg'.format(i * step + j + 1 + start_frame)) img_x = cv2.imread(flow_x_file, cv2.IMREAD_GRAYSCALE) img_y = cv2.imread(flow_y_file, cv2.IMREAD_GRAYSCALE) img_x = cv2.resize(img_x, dims[1::-1]) img_y = cv2.resize(img_y, dims[1::-1]) flow[:, :, j * 2, i] = img_x flow[:, :, j * 2 + 1, i] = img_y flow_flip[:, :, j * 2, i] = 255 - img_x[:, ::-1] flow_flip[:, :, j * 2 + 1, i] = img_y[:, ::-1] # crop flow_1 = flow[:224, :224, :, :] flow_2 = flow[:224, -224:, :, :] flow_3 = flow[16:240, 60:284, :, :] flow_4 = flow[-224:, :224, :, :] flow_5 = flow[-224:, -224:, :, :] flow_f_1 = flow_flip[:224, :224, :, :] flow_f_2 = flow_flip[:224, -224:, :, :] flow_f_3 = flow_flip[16:240, 60:284, :, :] flow_f_4 = flow_flip[-224:, :224, :, :] flow_f_5 = flow_flip[-224:, -224:, :, :] flow = np.concatenate((flow_1, flow_2, flow_3, flow_4, flow_5, flow_f_1, flow_f_2, flow_f_3, flow_f_4, flow_f_5), axis=3) _, _, _, c = flow.shape flow_list = [] for c_index in range(c): cur_img = flow[:, :, :, c_index].squeeze() cur_img_tensor = val_transform(cur_img) flow_list.append(np.expand_dims(cur_img_tensor.numpy(), 0)) flow_np = np.concatenate(flow_list, axis=0) batch_size = 25 prediction = np.zeros((num_categories, flow.shape[3])) num_batches = int(math.ceil(float(flow.shape[3]) / batch_size)) for bb in range(num_batches): span = range(batch_size * bb, min(flow.shape[3], batch_size * (bb + 1))) input_data = flow_np[span, :, :, :] imgDataTensor = torch.from_numpy(input_data).type( torch.FloatTensor).cuda() imgDataVar = torch.autograd.Variable(imgDataTensor) output = net(imgDataVar) result = output.data.cpu().numpy() prediction[:, span] = np.transpose(result) return prediction
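# Late-fusion sketch for the two streams: VideoSpatialPrediction (RGB) and
# VideoTemporalPrediction (stacked optical flow) each return a
# (num_categories, num_views) score matrix, and two-stream pipelines usually
# combine them with a weighted average. The 1:1.5 weighting below is a common
# choice but an assumption here, not a value taken from this project.
import numpy as np


def fuse_two_streams(spatial_pred, temporal_pred, w_spatial=1.0, w_temporal=1.5):
    spatial_scores = spatial_pred.mean(axis=1)
    temporal_scores = temporal_pred.mean(axis=1)
    fused = w_spatial * spatial_scores + w_temporal * temporal_scores
    return int(np.argmax(fused))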
def VideoTemporalPrediction(mode, vid_name, target, net, num_categories, start_frame=0, num_frames=0, num_samples=25, optical_flow_frames=10, new_size=299, ext=".jpg"): gc = GradCAM(model=net) if num_frames == 0: imglist = os.listdir(vid_name) duration = len(imglist) else: duration = num_frames # selection step = int(math.floor((duration - optical_flow_frames + 1) / num_samples)) clip_mean = [0.5] * optical_flow_frames * 2 clip_std = [0.226] * optical_flow_frames * 2 normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) test_transform = video_transforms.Compose( [video_transforms.ToTensor(), normalize]) # inception = 320,360, resnet = 240, 320 width = 320 if new_size == 299 else 240 height = 360 if new_size == 299 else 320 deep = optical_flow_frames * 2 dims = (width, height, deep, num_samples) flow = np.zeros(shape=dims, dtype=np.float64) flow_flip = np.zeros(shape=dims, dtype=np.float64) for i in range(num_samples): for j in range(optical_flow_frames): flow_x_file = os.path.join( vid_name, mode + '_x_{0:05d}{1}'.format(i * step + j + 1 + start_frame, ext)) flow_y_file = os.path.join( vid_name, mode + '_y_{0:05d}{1}'.format(i * step + j + 1 + start_frame, ext)) img_x = cv2.imread(flow_x_file, cv2.IMREAD_GRAYSCALE) img_y = cv2.imread(flow_y_file, cv2.IMREAD_GRAYSCALE) img_x = cv2.resize(img_x, dims[1::-1]) img_y = cv2.resize(img_y, dims[1::-1]) flow[:, :, j * 2, i] = img_x flow[:, :, j * 2 + 1, i] = img_y flow_flip[:, :, j * 2, i] = 255 - img_x[:, ::-1] flow_flip[:, :, j * 2 + 1, i] = img_y[:, ::-1] # crop 299 = inception, 224 = resnet size = new_size corner = [(height - size) // 2, (width - size) // 2] flow_1 = flow[:size, :size, :, :] flow_2 = flow[:size, -size:, :, :] flow_3 = flow[corner[1]:corner[1] + size, corner[0]:corner[0] + size, :, :] flow_4 = flow[-size:, :size, :, :] flow_5 = flow[-size:, -size:, :, :] flow_f_1 = flow_flip[:size, :size, :, :] flow_f_2 = flow_flip[:size, -size:, :, :] flow_f_3 = flow_flip[corner[1]:corner[1] + size, corner[0]:corner[0] + size, :, :] flow_f_4 = flow_flip[-size:, :size, :, :] flow_f_5 = flow_flip[-size:, -size:, :, :] flow = np.concatenate((flow_1, flow_2, flow_3, flow_4, flow_5, flow_f_1, flow_f_2, flow_f_3, flow_f_4, flow_f_5), axis=3) _, _, _, c = flow.shape flow_list = [] for c_index in range(c): cur_img = flow[:, :, :, c_index].squeeze() cur_img_tensor = test_transform(cur_img) flow_list.append(np.expand_dims(cur_img_tensor.numpy(), 0)) flow_np = np.concatenate(flow_list, axis=0) prediction = np.zeros((num_categories, flow.shape[3])) index = 50 input_data = flow_np[index:index + 1, :, :, :] raw_image_x = flow[:, :, [0, 2, 4, 6, 8], index] raw_image_y = flow[:, :, [1, 3, 5, 7, 9], index] print(raw_image_x.shape, raw_image_y.shape) imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda() imgDataVar = torch.autograd.Variable(imgDataTensor) probs, ids = gc.forward(imgDataVar) ids_ = torch.LongTensor([[target]] * len(imgDataVar)).to( torch.device("cuda")) gc.backward(ids=ids_) regions = gc.generate(target_layer="Mixed_7c") save_gradcam(vid_name.split("/")[-1] + "_x.png", gcam=regions[0, 0], raw_image=flow[:, :, 4:5, index]) save_gradcam(vid_name.split("/")[-1] + "_y.png", gcam=regions[0, 0], raw_image=flow[:, :, 5:6, index]) return prediction
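# `save_gradcam` is called in the Grad-CAM functions above but not defined in
# this file. The sketch below shows what such a helper typically does: resize
# the Grad-CAM map to the frame size, colorize it, and blend it with the raw
# image before writing to disk. The blending weights and colormap are
# assumptions, not the project's actual settings.
import cv2
import numpy as np


def save_gradcam_sketch(filename, gcam, raw_image):
    gcam = gcam.detach().cpu().numpy() if hasattr(gcam, 'detach') else np.asarray(gcam)
    gcam = cv2.resize(gcam, (raw_image.shape[1], raw_image.shape[0]))
    gcam = (gcam - gcam.min()) / (gcam.max() - gcam.min() + 1e-8)
    heatmap = cv2.applyColorMap(np.uint8(255 * gcam), cv2.COLORMAP_JET)
    if raw_image.ndim == 2 or raw_image.shape[2] == 1:
        raw_image = cv2.cvtColor(np.uint8(raw_image).squeeze(), cv2.COLOR_GRAY2BGR)
    overlay = 0.5 * heatmap + 0.5 * np.float64(raw_image)
    cv2.imwrite(filename, np.uint8(np.clip(overlay, 0, 255)))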
def main(): global args, best_prec1, model, writer, best_loss, length, width, height, input_size, scheduler args = parser.parse_args() training_continue = args.contine if '3D' in args.arch: if 'I3D' in args.arch or 'MFNET3D' in args.arch: if '112' in args.arch: scale = 0.5 else: scale = 1 else: if '224' in args.arch: scale = 1 else: scale = 0.5 elif 'r2plus1d' in args.arch: scale = 0.5 else: scale = 1 print('scale: %.1f' % (scale)) input_size = int(224 * scale) width = int(340 * scale) height = int(256 * scale) saveLocation = "./checkpoint/" + args.dataset + "_" + args.arch + "_split" + str( args.split) if not os.path.exists(saveLocation): os.makedirs(saveLocation) writer = SummaryWriter(saveLocation) # create model if args.evaluate: print("Building validation model ... ") model = build_model_validate() optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif training_continue: model, startEpoch, optimizer, best_prec1 = build_model_continue() for param_group in optimizer.param_groups: lr = param_group['lr'] #param_group['lr'] = lr print( "Continuing with best precision: %.3f and start epoch %d and lr: %f" % (best_prec1, startEpoch, lr)) else: print("Building model with ADAMW... ") model = build_model() optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) startEpoch = 0 if HALF: model.half() # convert to half precision for layer in model.modules(): if isinstance(layer, nn.BatchNorm2d): layer.float() print("Model %s is loaded. " % (args.arch)) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() criterion2 = nn.MSELoss().cuda() scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, verbose=True) print("Saving everything to directory %s." % (saveLocation)) if args.dataset == 'ucf101': dataset = './datasets/ucf101_frames' elif args.dataset == 'hmdb51': dataset = './datasets/hmdb51_frames' elif args.dataset == 'smtV2': dataset = './datasets/smtV2_frames' elif args.dataset == 'window': dataset = './datasets/window_frames' elif args.dataset == 'haa500_basketball': dataset = './datasets/haa500_basketball_frames' else: print("No convenient dataset entered, exiting....") return 0 cudnn.benchmark = True modality = args.arch.split('_')[0] if "3D" in args.arch or 'tsm' in args.arch or 'slowfast' in args.arch or 'r2plus1d' in args.arch: if '64f' in args.arch: length = 64 elif '32f' in args.arch: length = 32 else: length = 16 else: length = 1 # Data transforming if modality == "rgb" or modality == "pose": is_color = True scale_ratios = [1.0, 0.875, 0.75, 0.66] if 'I3D' in args.arch: if 'resnet' in args.arch: clip_mean = [0.45, 0.45, 0.45] * args.num_seg * length clip_std = [0.225, 0.225, 0.225] * args.num_seg * length else: clip_mean = [0.5, 0.5, 0.5] * args.num_seg * length clip_std = [0.5, 0.5, 0.5] * args.num_seg * length #clip_std = [0.25, 0.25, 0.25] * args.num_seg * length elif 'MFNET3D' in args.arch: clip_mean = [0.48627451, 0.45882353, 0.40784314 ] * args.num_seg * length clip_std = [0.234, 0.234, 0.234] * args.num_seg * length elif "3D" in args.arch: clip_mean = [114.7748, 107.7354, 99.4750] * args.num_seg * length clip_std = [1, 1, 1] * args.num_seg * length elif "r2plus1d" in args.arch: clip_mean = [0.43216, 0.394666, 0.37645] * args.num_seg * length clip_std = [0.22803, 0.22145, 0.216989] * args.num_seg * length elif "rep_flow" in args.arch: clip_mean = [0.5, 0.5, 0.5] * args.num_seg * length clip_std = [0.5, 0.5, 0.5] * args.num_seg * length elif "slowfast" in args.arch: 
clip_mean = [0.45, 0.45, 0.45] * args.num_seg * length clip_std = [0.225, 0.225, 0.225] * args.num_seg * length else: clip_mean = [0.485, 0.456, 0.406] * args.num_seg * length clip_std = [0.229, 0.224, 0.225] * args.num_seg * length elif modality == "pose": is_color = True scale_ratios = [1.0, 0.875, 0.75, 0.66] clip_mean = [0.485, 0.456, 0.406] * args.num_seg clip_std = [0.229, 0.224, 0.225] * args.num_seg elif modality == "flow": is_color = False scale_ratios = [1.0, 0.875, 0.75, 0.66] if 'I3D' in args.arch: clip_mean = [0.5, 0.5] * args.num_seg * length clip_std = [0.5, 0.5] * args.num_seg * length elif "3D" in args.arch: clip_mean = [127.5, 127.5] * args.num_seg * length clip_std = [1, 1] * args.num_seg * length else: clip_mean = [0.5, 0.5] * args.num_seg * length clip_std = [0.226, 0.226] * args.num_seg * length elif modality == "both": is_color = True scale_ratios = [1.0, 0.875, 0.75, 0.66] clip_mean = [0.485, 0.456, 0.406, 0.5, 0.5] * args.num_seg * length clip_std = [0.229, 0.224, 0.225, 0.226, 0.226] * args.num_seg * length else: print("No such modality. Only rgb and flow supported.") normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) if "3D" in args.arch and not ('I3D' in args.arch): train_transform = video_transforms.Compose([ video_transforms.MultiScaleCrop((input_size, input_size), scale_ratios), video_transforms.RandomHorizontalFlip(), video_transforms.ToTensor2(), normalize, ]) val_transform = video_transforms.Compose([ video_transforms.CenterCrop((input_size)), video_transforms.ToTensor2(), normalize, ]) else: train_transform = video_transforms.Compose([ video_transforms.MultiScaleCrop((input_size, input_size), scale_ratios), video_transforms.RandomHorizontalFlip(), video_transforms.ToTensor(), normalize, ]) val_transform = video_transforms.Compose([ video_transforms.CenterCrop((input_size)), video_transforms.ToTensor(), normalize, ]) # data loading train_setting_file = "train_%s_split%d.txt" % (modality, args.split) train_split_file = os.path.join(args.settings, args.dataset, train_setting_file) val_setting_file = "val_%s_split%d.txt" % (modality, args.split) val_split_file = os.path.join(args.settings, args.dataset, val_setting_file) if not os.path.exists(train_split_file) or not os.path.exists( val_split_file): print( "No split file exists in %s directory. 
Preprocess the dataset first" % (args.settings)) train_dataset = datasets.__dict__[args.dataset]( root=dataset, source=train_split_file, phase="train", modality=modality, is_color=is_color, new_length=length, new_width=width, new_height=height, video_transform=train_transform, num_segments=args.num_seg) val_dataset = datasets.__dict__[args.dataset]( root=dataset, source=val_split_file, phase="val", modality=modality, is_color=is_color, new_length=length, new_width=width, new_height=height, video_transform=val_transform, num_segments=args.num_seg) print('{} samples found, {} train samples and {} test samples.'.format( len(val_dataset) + len(train_dataset), len(train_dataset), len(val_dataset))) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True) if args.evaluate: prec1, prec3, _ = validate(val_loader, model, criterion, criterion2, modality) return for epoch in range(startEpoch, args.epochs): # if learning_rate_index > max_learning_rate_decay_count: # break # adjust_learning_rate(optimizer, epoch) train(train_loader, model, criterion, criterion2, optimizer, epoch, modality) # evaluate on validation set prec1 = 0.0 lossClassification = 0 if (epoch + 1) % args.save_freq == 0: prec1, prec3, lossClassification = validate( val_loader, model, criterion, criterion2, modality) writer.add_scalar('data/top1_validation', prec1, epoch) writer.add_scalar('data/top3_validation', prec3, epoch) writer.add_scalar('data/classification_loss_validation', lossClassification, epoch) scheduler.step(lossClassification) # remember best prec@1 and save checkpoint is_best = prec1 >= best_prec1 best_prec1 = max(prec1, best_prec1) # best_in_existing_learning_rate = max(prec1, best_in_existing_learning_rate) # # if best_in_existing_learning_rate > prec1 + 1: # learning_rate_index = learning_rate_index # best_in_existing_learning_rate = 0 if (epoch + 1) % args.save_freq == 0: checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint.pth.tar") if is_best: print("Model works well") save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'best_loss': best_loss, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint_name, saveLocation) checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint.pth.tar") save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'best_loss': best_loss, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint_name, saveLocation) writer.export_scalars_to_json("./all_scalars.json") writer.close()
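# Sketch of the `save_checkpoint` helper used throughout these training
# scripts: it writes the state dict to <out_dir>/<filename> and, when
# `is_best` is true, copies it to model_best.pth.tar. Filenames beyond what
# the calls above show are assumptions.
import os
import shutil
import torch


def save_checkpoint_sketch(state, is_best, filename, out_dir):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    path = os.path.join(out_dir, filename)
    torch.save(state, path)
    if is_best:
        shutil.copyfile(path, os.path.join(out_dir, 'model_best.pth.tar'))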
def main(): global args, best_prec1 args = parser.parse_args() # create model print("Building model ... ") model = build_model() if torch.cuda.is_available(): model = torch.nn.DataParallel(model) print("Model %s is loaded. " % (args.arch)) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) if not os.path.exists(args.resume): os.makedirs(args.resume) print("Saving everything to directory %s." % (args.resume)) cudnn.benchmark = True # Data transforming if args.modality == "rgb": is_color = True scale_ratios = [1.0, 0.875, 0.75, 0.66] clip_mean = [0.485, 0.456, 0.406] * args.new_length clip_std = [0.229, 0.224, 0.225] * args.new_length elif args.modality == "flow": is_color = False scale_ratios = [1.0, 0.875, 0.75] clip_mean = [0.5, 0.5] * args.new_length clip_std = [0.226, 0.226] * args.new_length else: print("No such modality. Only rgb and flow supported.") normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) train_transform = video_transforms.Compose([ # video_transforms.Scale((256)), video_transforms.MultiScaleCrop((224, 224), scale_ratios), video_transforms.RandomHorizontalFlip(), video_transforms.ToTensor(), normalize, ]) val_transform = video_transforms.Compose([ # video_transforms.Scale((256)), video_transforms.CenterCrop((224)), video_transforms.ToTensor(), normalize, ]) # data loading # train_setting_file = "train_%s_split%d.txt" % (args.modality, args.split) # train_split_file = os.path.join(args.settings, args.dataset, train_setting_file) # val_setting_file = "val_%s_split%d.txt" % (args.modality, args.split) # val_split_file = os.path.join(args.settings, args.dataset, val_setting_file) # if not os.path.exists(train_split_file) or not os.path.exists(val_split_file): # print("No split file exists in %s directory. 
Preprocess the dataset first" % (args.settings)) train_split_file = './datasets/settings/train_set_detail.csv' val_split_file = './datasets/settings/val_set_detail.csv' train_dataset = datasets.__dict__[args.dataset]( root=args.data, #neet to change source=train_split_file, phase="train", modality=args.modality, is_color=is_color, new_length=args.new_length, new_width=args.new_width, new_height=args.new_height, video_transform=train_transform, name_pattern="frame%06d.jpg") # frame000001 val_dataset = datasets.__dict__[args.dataset]( root=args.data, source=val_split_file, phase="val", modality=args.modality, is_color=is_color, new_length=args.new_length, new_width=args.new_width, new_height=args.new_height, video_transform=val_transform, name_pattern="frame%06d.jpg") print('{} samples found, {} train samples and {} test samples.'.format( len(val_dataset) + len(train_dataset), len(train_dataset), len(val_dataset))) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) if args.evaluate: validate(val_loader, model, criterion) return model_path = '/home/thl/Desktop/challeng/checkpoints/Mulity_100step_900epoch_batch80/model_best.pth.tar' params = torch.load(model_path) model.load_state_dict(params['state_dict']) for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) # evaluate on validation set prec1 = 0.0 if (epoch + 1) % args.save_freq == 0: prec1 = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) if (epoch + 1) % args.save_freq == 0: checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint.pth.tar") save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint_name, args.resume)
def main(): global args, best_prec1 args = parser.parse_args() print(args.modality + " network trained whith the split " + str(args.split) + ".") # create model print("Building model ... ") exits_model, model = build_model(int(args.start_epoch)) if not exits_model: return else: print("Model %s is loaded. " % (args.arch)) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # create file where we allocate the models by each args.save_freq epochs if not os.path.exists(args.resume): os.makedirs(args.resume) print("Saving everything to directory %s." % (args.resume)) cudnn.benchmark = True # Data transforming if args.modality == "rgb" or args.modality == "rhythm" or args.modality == "history": is_color = True scale_ratios = [1.0, 0.875, 0.75, 0.66] clip_mean = [0.485, 0.456, 0.406] * args.new_length clip_std = [0.299, 0.224, 0.225] * args.new_length elif args.modality == "flow": is_color = False scale_ratios = [1.0, 0.875, 0.75] clip_mean = [0.5, 0.5] * args.new_length clip_std = [0.226, 0.226] * args.new_length else: print("No such modality. Only rgb and flow supported.") new_size = 299 if args.arch == 'rgb_inception_v3' else 224 normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) train_transform = video_transforms.Compose([ #video_transforms.Scale((256)), video_transforms.MultiScaleCrop((new_size, new_size), scale_ratios), video_transforms.RandomHorizontalFlip(), video_transforms.ToTensor(), normalize, ]) val_transform = video_transforms.Compose([ # video_transforms.Scale((256)), video_transforms.CenterCrop((new_size)), video_transforms.ToTensor(), normalize, ]) #createNewDataset("train_%s_split%d.txt" , "new_train.txt") #createNewDataset("val_%s_split%d.txt", "new_test.txt") # data loading #train_setting_file = 'new_train.txt' modality_ = "rgb" if (args.modality == "rhythm" or args.modality == "history") else args.modality train_setting_file = "train_%s_split%d.txt" % (modality_, args.split) train_split_file = os.path.join(args.settings, args.dataset, train_setting_file) #val_setting_file = 'new_test.txt' val_setting_file = "val_%s_split%d.txt" % (modality_, args.split) val_split_file = os.path.join(args.settings, args.dataset, val_setting_file) if not os.path.exists(train_split_file) or not os.path.exists( val_split_file): print( "No split file exists in %s directory. 
Preprocess the dataset first" % (args.settings)) train_dataset = datasets.__dict__['dataset']( root=args.data, source=train_split_file, phase="train", modality=args.modality, is_color=is_color, new_length=args.new_length, new_width=args.new_width, new_height=args.new_height, video_transform=train_transform) val_dataset = datasets.__dict__['dataset'](root=args.data, source=val_split_file, phase="val", modality=args.modality, is_color=is_color, new_length=args.new_length, new_width=args.new_width, new_height=args.new_height, video_transform=val_transform) print('{} samples found, {} train samples and {} test samples.'.format( len(val_dataset) + len(train_dataset), len(train_dataset), len(val_dataset))) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) if args.evaluate: validate(val_loader, model, criterion) return for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) # evaluate on validation set prec1 = 0.0 if (epoch + 1) % args.save_freq == 0: prec1 = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) if (epoch + 1) % args.save_freq == 0: checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint_" + args.modality + "_split_" + str(args.split) + ".pth.tar") save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint_name, args.resume)
def VideoSpatialPrediction( vid_name, net, num_categories, num_frames=0, ext_batch_sz=100, int_batch_sz=5, new_size = 299 ): if num_frames == 0: imglist = os.listdir(vid_name) duration = len(imglist) else: duration = num_frames clip_mean = [0.485, 0.456, 0.406] clip_std = [0.229, 0.224, 0.225] normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) val_transform = video_transforms.Compose([ video_transforms.ToTensor(), normalize, ]) deep = 3 # inception = 320,360, resnet = 240, 320 width = 320 if new_size==299 else 240 height = 360 if new_size==299 else 320 predictions = [] for i in range(len(num_categories)): predictions.append(np.zeros((num_categories[i],num_frames*10))) #control memory (RAM) usage num_ext_batch = int(math.ceil(float(num_frames)/ext_batch_sz)) for i in range(num_ext_batch): start = i*ext_batch_sz end = min(start+ext_batch_sz, num_frames) dims = (width,height,deep,end-start) rgb = np.zeros(shape=dims, dtype=np.float64) rgb_flip = np.zeros(shape=dims, dtype=np.float64) for j in range(end-start): img_file = os.path.join(vid_name, 'img_{0:05d}.jpg'.format(j+start+1)) img = cv2.imread(img_file, cv2.IMREAD_UNCHANGED) img = cv2.resize(img, dims[1::-1]) img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) rgb[:,:,:,j] = img rgb_flip[:,:,:,j] = img[:,::-1,:] # crop 299 = inception, 224 = resnet size = new_size corner = [(height-size)//2, (width-size)//2] rgb_1 = rgb[:size, :size, :,:] rgb_2 = rgb[:size, -size:, :,:] rgb_3 = rgb[corner[1]:corner[1]+size, corner[0]:corner[0]+size, :,:] rgb_4 = rgb[-size:, :size, :,:] rgb_5 = rgb[-size:, -size:, :,:] rgb_f_1 = rgb_flip[:size, :size, :,:] rgb_f_2 = rgb_flip[:size, -size:, :,:] rgb_f_3 = rgb_flip[corner[1]:corner[1]+size, corner[0]:corner[0]+size, :,:] rgb_f_4 = rgb_flip[-size:, :size, :,:] rgb_f_5 = rgb_flip[-size:, -size:, :,:] rgb = np.concatenate((rgb_1,rgb_2,rgb_3,rgb_4,rgb_5,rgb_f_1,rgb_f_2,rgb_f_3,rgb_f_4,rgb_f_5), axis=3) rgb_1, rgb_2, rgb_3, rgb_4, rgb_5 = [],[],[],[],[] rgb_f_1, rgb_f_2, rgb_f_3, rgb_f_4, rgb_f_5 = [],[],[],[],[] rgb_flip = [] _, _, _, c = rgb.shape rgb_list = [] for c_index in range(c): cur_img = rgb[:,:,:,c_index] cur_img_tensor = val_transform(cur_img) rgb_list.append(np.expand_dims(cur_img_tensor.numpy(), 0)) rgb_shape = rgb.shape rgb = [] rgb_np = np.concatenate(rgb_list,axis=0) #control memory (GPU) usage num_int_batches = int(math.ceil(float(rgb_shape[3])/int_batch_sz)) rgb_list = [] for bb in range(num_int_batches): span = range(int_batch_sz*bb, min(rgb_shape[3],int_batch_sz*(bb+1))) input_data = rgb_np[span,:,:,:] imgDataTensor = torch.from_numpy(input_data).type(torch.FloatTensor).cuda() imgDataVar = torch.autograd.Variable(imgDataTensor) output = net(imgDataVar) for ii in range(len(output)): output_ = output[ii].reshape(-1, num_categories[ii]) result = output_.data.cpu().numpy() pos = [ x%(end-start) + start + int(x/(end-start))*num_frames for x in span ] predictions[ii][:, pos] = np.transpose(result) rgb_np = [] result = [] for ii in range(len(predictions)): result.append(np.split(predictions[ii],10,axis=1)) return result
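# The multi-head VideoSpatialPrediction above returns, for each output head,
# its prediction matrix split into 10 views (5 crops x 2 flips). A typical
# consumer averages over views and then over frames to get one score vector
# per head. A small sketch, assuming simple mean pooling is acceptable:
import numpy as np


def average_views(result):
    scores = []
    for head_views in result:  # list of 10 (num_categories, num_frames) arrays
        per_view = np.stack(head_views, axis=0).mean(axis=0)  # average over views
        scores.append(per_view.mean(axis=1))                  # average over frames
    return scores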
def main(): global args, best_prec1 args = parser.parse_args() # create model print("Building model ... ") model = build_model() print("Model %s is loaded. " % (args.modality + "_" + args.arch)) if not os.path.exists(args.resume): os.makedirs(args.resume) print("Saving everything to directory %s." % (args.resume)) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) cudnn.benchmark = True # Data transforming clip_mean = [0.485, 0.456, 0.406] * args.new_length clip_std = [0.229, 0.224, 0.225] * args.new_length normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) if args.modality == "rgb": scale_ratios = [1.0, 0.875, 0.75, 0.66] elif args.modality == "flow": scale_ratios = [1.0, 0.875, 0.75] else: print("No such modality. Only rgb and flow supported.") train_transform = video_transforms.Compose([ video_transforms.Scale((256)), video_transforms.MultiScaleCrop((224, 224), scale_ratios), video_transforms.RandomHorizontalFlip(), video_transforms.ToTensor(), normalize, ]) val_transform = video_transforms.Compose([ video_transforms.Scale((256)), video_transforms.CenterCrop((224)), video_transforms.ToTensor(), normalize, ]) # data loading train_setting_file = "train_%s_split%d.txt" % (args.modality, args.split) train_split_file = os.path.join(args.settings, args.dataset, train_setting_file) val_setting_file = "val_%s_split%d.txt" % (args.modality, args.split) val_split_file = os.path.join(args.settings, args.dataset, val_setting_file) if not os.path.exists(train_split_file) or not os.path.exists( val_split_file): print( "No split file exists in %s directory. Preprocess the dataset first" % (args.settings)) train_dataset = datasets.__dict__[args.dataset]( args.data, train_split_file, "train", args.new_length, video_transform=train_transform) val_dataset = datasets.__dict__[args.dataset]( args.data, val_split_file, "val", args.new_length, video_transform=val_transform) print('{} samples found, {} train samples and {} test samples.'.format( len(val_dataset) + len(train_dataset), len(train_dataset), len(val_dataset))) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) if args.evaluate: validate(val_loader, model, criterion) return for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) # evaluate on validation set prec1 = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) if (epoch + 1) % args.save_freq == 0: checkpoint_name = "%03d_%s" % (epoch + 1, "checkpoint.pth.tar") save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint_name, args.resume)
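# `adjust_learning_rate(optimizer, epoch)` is called in several training loops
# above but not defined here. The usual step-decay pattern for these
# two-stream scripts is shown below; the base learning rate, decay factor and
# step size are assumptions, not values taken from this project.
def adjust_learning_rate_sketch(optimizer, epoch, base_lr=0.001, decay=0.1, step=100):
    lr = base_lr * (decay ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr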
def extract_from_three_stream(args): model = main_three_stream.build_model(num_classes=args.num_classes, input_length=args.new_length) # create model print("Building model ... ") model = torch.nn.DataParallel(model) model = model.to(args.device) # define loss function (criterion) and optimizer if os.path.isfile(args.vision_model_path): model, _, start_epoch = main_three_stream.load_checkpoint( model, None, args.vision_model_path) is_color = True clip_mean = { 'rgb': [0.485, 0.456, 0.406] * args.new_length, 'flow': [0.9432, 0.9359, 0.9511] * args.new_length, 'skeleton': [0.0071, 0.0078, 0.0079] * args.new_length } clip_std = { 'rgb': [0.229, 0.224, 0.225] * args.new_length, 'flow': [0.0788, 0.0753, 0.0683] * args.new_length, 'skeleton': [0.0581, 0.0621, 0.0623] * args.new_length } normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std) train_transform = video_transforms.Compose([ video_transforms.Resize((args.new_width, args.new_height)), video_transforms.ToTensor(), normalize, ]) val_transform = video_transforms.Compose([ video_transforms.Resize((args.new_width, args.new_height)), video_transforms.ToTensor(), normalize, ]) train_dataset = datasets.__dict__[args.dataset]( root=args.data, source=args.train_split_file, phase="train", is_color=is_color, new_length=args.new_length, video_transform=train_transform, return_id=True) val_dataset = datasets.__dict__[args.dataset]( root=args.data, source=args.test_split_file, phase="val", is_color=is_color, new_length=args.new_length, video_transform=val_transform, return_id=True) print('{} samples found, {} train samples and {} test samples.'.format( len(val_dataset) + len(train_dataset), len(train_dataset), len(val_dataset))) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=args.workers, pin_memory=True) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False, num_workers=args.workers, pin_memory=True) print("Extracting train visual representations") outputs_clip_train = infer_three_stream(train_loader, model, classes=val_dataset.classes) pickle.dump(outputs_clip_train, open(args.visual_representations_train, 'wb')) print("Extracting validation visual representations") outputs_clip_val = infer_three_stream(val_loader, model, classes=val_dataset.classes) pickle.dump(outputs_clip_val, open(args.visual_representations_val, 'wb')) return outputs_clip_train, outputs_clip_val
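# The extracted representations above are written with pickle.dump; reading
# them back for downstream models is just the inverse call. The path argument
# is whatever was passed as args.visual_representations_train / _val.
import pickle


def load_representations(path):
    with open(path, 'rb') as fp:
        return pickle.load(fp)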