def fine_tune_autoencoder(self, inputs, model, args):
    """Fine-tune a copy of the autoencoder on a single input clip."""
    model = copy.deepcopy(model)
    model.train()
    params = model.parameters()
    #lr = 1e-4
    #warmup = 10
    #optimizer = torch.optim.SGD(params, lr=lr, momentum=args.momentum, weight_decay=0)
    optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=0)
    criteria = AutoencoderCriterion(args)
    tol = 1e-2
    loss = torch.Tensor([999])
    timer = Timer()
    try:
        with torch.enable_grad():
            num_iter = 0
            while loss.item() > tol:
                #if num_iter > warmup:
                #    lr = 1e-3
                optimizer.zero_grad()
                x_hat, code, x = model(inputs, None)
                _, loss, _ = criteria(x_hat, code, x, None, None)
                loss.backward()
                optimizer.step()
                num_iter += 1
                timer.tic()
                if num_iter % args.print_freq == 0:
                    print('    Iter: [{0}]\t'
                          'Time {timer.val:.3f} ({timer.avg:.3f}) Loss: {1}'.format(
                              num_iter, loss.item(), timer=timer))
    except KeyboardInterrupt as e:
        print(e)
    return model, x_hat
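# --- Hedged usage sketch (not part of the original code) ---
# Assuming `self` is the task object defining fine_tune_autoencoder above,
# `pretrained_model` is the autoencoder, `clip` is a video batch already on the
# correct device, and `args` provides `lr` and `print_freq`, a per-clip call
# could look like the following (the variable names are hypothetical):
#
#     specific_model, x_hat = self.fine_tune_autoencoder(clip, pretrained_model, args)
#     # `specific_model` is a deep copy fine-tuned on this one clip;
#     # `x_hat` is the reconstruction from the final fine-tuning iteration.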
def get_item(self, index, shift=None):
    ims, tars, meta = [], [], {}
    meta['do_not_collate'] = True
    fps = 24
    n = self.data['datas'][index]['n']
    if shift is None:
        shift = np.random.randint(n - self.train_gap - 2)
    else:
        shift = int(shift * (n - self.train_gap - 2))
    resize = transforms.Resize(int(256. / 224 * self.input_size))
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    spacing = np.arange(shift, shift + self.train_gap)
    for loc in spacing:
        ii = int(np.floor(loc))
        path = '{}{:06d}.jpg'.format(self.data['datas'][index]['base'], ii + 1)
        try:
            # ============ Temp ===================
            timer = Timer()
            img = default_loader(path)
            # ============ Temp ===================
            load_img_cost = timer.thetime() - timer.end
            timer.tic()
            print('Load image from disk: {0:.3f} sec'.format(load_img_cost))
        except Exception as e:
            print('failed to load image {}'.format(path))
            print(e)
            raise
        img = resize(img)
        img = transforms.ToTensor()(img)
        # ============ Temp ===================
        # totensor_cost = timer.thetime() - timer.end
        # timer.tic()
        # print('From PIL to tensor: {0:.3f} sec'.format(totensor_cost))
        #img = 2*img - 1
        img = normalize(img)
        ims.append(img)
        target = torch.IntTensor(self.num_classes).zero_()
        for x in self.data['datas'][index]['labels']:
            if x['start'] < ii / float(fps) < x['end']:
                target[self.cls2int(x['class'])] = 1
        tars.append(target)
    meta['id'] = self.data['datas'][index]['id']
    meta['time'] = shift
    img = torch.stack(ims).permute(0, 2, 3, 1).numpy()
    target = torch.stack(tars)
    if self.transform is not None:
        img = self.transform(img)
    # ============ Temp ===================
    # transform_cost = timer.thetime() - timer.end
    # timer.tic()
    # print('Image transform per mini-batch: {0:.3f} sec'.format(transform_cost))
    if self.target_transform is not None:
        target = self.target_transform(target)
    # batch will be b x n x h x w x c
    # target will be b x n x nc
    return img, target, meta
def stabilize_all(self, loader, model, epoch, args):
    timer = Timer()
    for i, (inputs, target, meta) in enumerate(loader):
        if i >= self.num_videos:
            break
        if not args.cpu:
            inputs = inputs.cuda()
            target = target.cuda(non_blocking=True)
        original = inputs.detach().clone()
        reconstructed = model(inputs, None)[0]
        specific_model, fine_tuned = self.fine_tune_autoencoder(inputs, model, args)
        #fine_tuned = specific_model(inputs, None)[0]

        # prepare videos
        original = original[0]
        fine_tuned = fine_tuned[0]
        reconstructed = reconstructed[0]
        original *= torch.Tensor([0.229, 0.224, 0.225])[None, None, None, :].to(original.device)
        original += torch.Tensor([0.485, 0.456, 0.406])[None, None, None, :].to(original.device)
        fine_tuned *= torch.Tensor([0.229, 0.224, 0.225])[None, None, None, :].to(original.device)
        fine_tuned += torch.Tensor([0.485, 0.456, 0.406])[None, None, None, :].to(original.device)
        reconstructed *= torch.Tensor([0.229, 0.224, 0.225])[None, None, None, :].to(original.device)
        reconstructed += torch.Tensor([0.485, 0.456, 0.406])[None, None, None, :].to(original.device)

        # save video
        name = '{}_{}'.format(meta[0]['id'], meta[0]['time'])
        ffmpeg_video_writer(original.cpu(), '{}/{}_original.mp4'.format(args.cache, name))
        ffmpeg_video_writer(fine_tuned.cpu(), '{}/{}_finetuned.mp4'.format(args.cache, name))
        ffmpeg_video_writer(reconstructed.cpu(), '{}/{}_reconstructed.mp4'.format(args.cache, name))
        combined = torch.cat((original.cpu(), reconstructed.cpu(), fine_tuned.cpu()), 2)
        ffmpeg_video_writer(combined, '{}/{}_combined.mp4'.format(args.cache, name))

        timer.tic()
        print('Autoencoder: [{0}/{1}]\t'
              'Time {timer.val:.3f} ({timer.avg:.3f})'.format(
                  i, self.num_videos, timer=timer))
    return {}
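# --- Hedged sketch (not part of the original code) ---
# The six lines above undo the ImageNet normalization channel-by-channel before
# writing each video to disk, and the same pattern reappears in the other
# stabilize_all further below. A small helper such as the hypothetical
# `denormalize` captures it; it assumes `torch` is imported as in the
# surrounding code and that the video tensor is laid out (T, H, W, C),
# matching the broadcasting used above.
def denormalize(video, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Undo per-channel ImageNet normalization on a (T, H, W, C) video tensor."""
    std_t = torch.tensor(std, device=video.device)[None, None, None, :]
    mean_t = torch.tensor(mean, device=video.device)[None, None, None, :]
    return video * std_t + mean_t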
def train(loader, model, optimizer, epoch, args):
    timer = Timer()
    data_time = AverageMeter()
    loss_meter = AverageMeter()
    ce_loss_meter = AverageMeter()
    cur_lr = adjust_learning_rate(args.lr_decay_rate, optimizer, epoch)
    model.train()
    optimizer.zero_grad()
    ce_loss_criterion = nn.CrossEntropyLoss()
    for i, (input, meta) in tqdm(enumerate(loader), desc="Train Epoch"):
        if args.debug and i >= debug_short_train_num:
            break
        data_time.update(timer.thetime() - timer.end)

        _batch_size = len(meta)
        target = []
        for b in range(_batch_size):
            target.extend(meta[b]["labels"])
        target = torch.from_numpy(np.array(target))
        input = input.view(
            _batch_size * 3,
            input.shape[2],
            input.shape[3],
            input.shape[4],
            input.shape[5],
        )
        metric_feat, output = model(input)
        ce_loss = ce_loss_criterion(output.cuda(), target.long().cuda())
        loss = ce_loss
        loss.backward()
        loss_meter.update(loss.item())
        ce_loss_meter.update(ce_loss.item())
        if i % args.accum_grad == args.accum_grad - 1:
            optimizer.step()
            optimizer.zero_grad()
        if i % args.print_freq == 0 and i > 0:
            logger.info("[{0}][{1}/{2}]\t"
                        "Dataload_Time={data_time.avg:.3f}\t"
                        "Loss={loss.avg:.4f}\t"
                        "CELoss={ce_loss.avg:.4f}\t"
                        "LR={cur_lr:.7f}\t"
                        "bestAP={ap:.3f}".format(
                            epoch, i, len(loader),
                            data_time=data_time,
                            loss=loss_meter,
                            ce_loss=ce_loss_meter,
                            ap=args.best_score,
                            cur_lr=cur_lr,
                        ))
            loss_meter.reset()
            ce_loss_meter.reset()
def validate_video(self, loader, model, criterion, epoch, args):
    """ Run video-level validation on the test set """
    timer = Timer()
    ids, outputs = [], []
    for meta in loader.dataset.data['datas']:
        ids.append(meta['id'])
    metrics = [m() for m in self.metrics]

    # switch to evaluate mode
    model.eval()
    criterion.eval()

    for i, (input, target, meta) in enumerate(loader):
        if not args.cpu:
            input = input.cuda()
            target = target.cuda(non_blocking=True)

        # split batch into smaller chunks
        if args.video_batch_size == -1:
            output = model(input, meta)
        else:
            output_chunks = []
            for chunk in input.split(args.video_batch_size):
                output_chunks.append(model(chunk, meta))
            output = gather(output_chunks, input.device)
        if type(output) != tuple:
            output = (output, )
        scores, loss, score_target = criterion(*(output + (target, meta)), synchronous=True)
        for m in metrics:
            m.update(scores, score_target)

        # store predictions
        scores_video = scores.max(dim=0)[0]
        outputs.append(scores_video.cpu())
        # ids.append(meta['id'][0])

        timer.tic()
        if i % args.print_freq == 0:
            print('[{name}] {task}: [{0}/{1}]\t'
                  'Time {timer.val:.3f} ({timer.avg:.3f})\t'
                  '{metrics}'.format(i, len(loader), timer=timer,
                                     name=args.name, task=self.name,
                                     metrics=' \t'.join(str(m) for m in metrics)))
        del loss, output, target  # make sure we don't hold on to the graph

    submission_file(ids, outputs, '{}/epoch_{:03d}.txt'.format(args.cache, epoch + 1))
    metrics = dict(m.compute() for m in metrics)
    metrics = dict((self.name + '_' + k, v) for k, v in metrics.items())
    print(metrics)
    return metrics
def read_multi_images(target_dir):
    timer = Timer()
    image_num = 0
    for f in listdir(target_dir):
        file_full_path = join(target_dir, f)
        if isfile(file_full_path):
            read_one_iamge(file_full_path)
            image_num += 1
    time_cost = timer.thetime() - timer.end
    print('Load images from disk: {0:.3f} sec'.format(time_cost))
    return time_cost, image_num
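# --- Hedged usage sketch (not part of the original code) ---
# Timing a directory of pre-extracted Charades frames; the path below is
# hypothetical and only illustrates the call:
#
#     total_sec, n_images = read_multi_images('/path/to/Charades_v1_rgb/ZZXQF')
#     print('{} images in {:.3f} sec ({:.4f} sec/image)'.format(
#         n_images, total_sec, total_sec / max(n_images, 1)))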
def alignment(self, loader, model, epoch, args, task=best_one_sec_moment):
    timer = Timer()
    abssec = MedianMeter()
    abssec0 = MedianMeter()
    randsec = MedianMeter()
    model = ActorObserverFC7Wrapper(model, args)
    model = set_distributed_backend(model, args)

    # switch to evaluate mode
    model.eval()

    def fc7_generator():
        for i, (inputs, target, meta) in enumerate(loader):
            if not args.cpu:
                target = target.cuda(non_blocking=True)
            first_fc7, third_fc7, w_x, w_y = model(*inputs)
            timer.tic()
            if i % args.print_freq == 0:
                print('Alignment: [{0}/{1}]\t'
                      'Time {timer.val:.3f} ({timer.avg:.3f})'.format(
                          i, len(loader), timer=timer))
            for vid, o1, o2 in zip(meta['id'], first_fc7, third_fc7):
                yield vid, (o1.cpu().numpy(), o2.cpu().numpy())

    for key, grp in groupby(fc7_generator(), key=lambda x: x[0]):
        print('processing id: {}'.format(key))
        _, mat = fc7list2mat(grp)
        _, _, _, j, gt = task(mat, winsize=3)
        _, _, _, j0, gt0 = task(mat, winsize=0)
        _, _, _, jr, gtr = task(np.random.randn(*mat.shape), winsize=3)
        abssec.update(abs(j - gt))
        abssec0.update(abs(j0 - gt0))
        randsec.update(abs(jr - gtr))
        print(self.name)
        print(' abs3: {abs3.val:.3f} ({abs3.avg:.3f}) [{abs3.med:.3f}]'
              ' abs0: {abs0.val:.3f} ({abs0.avg:.3f}) [{abs0.med:.3f}]'
              '\n'
              ' absr: {absr.val:.3f} ({absr.avg:.3f}) [{absr.med:.3f}]'.format(
                  abs3=abssec, abs0=abssec0, absr=randsec))
    scores = {self.name + '_1sec': abssec0.med,
              self.name + '_1sec_smooth': abssec.med,
              self.name + '_1sec_random': randsec.med}
    return scores
def train(self, loader, model, criterion, optimizer, epoch, metrics, args, validate=False):
    timer = Timer()
    data_time = AverageMeter()
    losses = AverageMeter()
    metrics = [m() for m in metrics]
    if validate:
        # switch to evaluate mode
        model.eval()
        criterion.eval()
        iter_size = args.val_size
        setting = 'Validate Epoch'
    else:
        # switch to train mode
        adjust_learning_rate(args.lr, args.lr_decay_rate, optimizer, epoch)
        model.train()
        criterion.train()
        optimizer.zero_grad()
        iter_size = args.train_size
        setting = 'Train Epoch'

    for i, (input, target, meta) in enumerate(part(loader, iter_size)):
        if args.synchronous:
            assert meta['id'][0] == meta['id'][1], "dataset not synced"
        data_time.update(timer.thetime() - timer.end)
        if not args.cpu:
            target = target.cuda(non_blocking=True)
        output = model(input, meta)
        if type(output) != tuple:
            output = (output,)
        scores, loss, score_target = criterion(*(output + (target, meta)))
        losses.update(loss.item())
        with torch.no_grad():
            for m in metrics:
                m.update(scores, score_target)
        if not validate:
            loss.backward()
            if i % args.accum_grad == args.accum_grad - 1:
                print('updating parameters')
                optimizer.step()
                optimizer.zero_grad()
        timer.tic()
        if i % args.print_freq == 0:
            print('[{name}] {setting}: [{0}][{1}/{2}({3})]\t'
                  'Time {timer.val:.3f} ({timer.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  '{metrics}'.format(
                      epoch, i, int(len(loader) * iter_size), len(loader),
                      name=args.name, setting=setting, timer=timer,
                      data_time=data_time, loss=losses,
                      metrics=' \t'.join(str(m) for m in metrics)))
        del loss, output, target  # make sure we don't hold on to the graph

    metrics = dict(m.compute() for m in metrics)
    metrics.update({'loss': losses.avg})
    metrics = dict(('val_' + k, v) if validate else ('train_' + k, v)
                   for k, v in metrics.items())
    return metrics
def stabilize_video(self, video, model, args):
    # optimizer = torch.optim.LBFGS([video.requires_grad_()])
    if self.stabilization_target == 'video':
        params = [video.requires_grad_()]
    elif self.stabilization_target == 'network':
        decoder = ResNet503DDecoder.get(args)
        decoder = decoder.to(next(model.parameters()).device)
        params = decoder.parameters()
    elif self.stabilization_target == 'network2':
        decoder = ResNet503DDecoder2.get(args)
        decoder = decoder.to(next(model.parameters()).device)
        params = decoder.parameters()
    elif self.stabilization_target == 'network3':
        decoder = ResNet503DDecoder3.get(args)
        decoder = decoder.to(next(model.parameters()).device)
        params = decoder.parameters()
    elif self.stabilization_target == 'transformer':
        transformer = VideoStabilizer(64).to(next(model.parameters()).device)
        params = transformer.parameters()
    elif self.stabilization_target == 'deformer':
        transformer = VideoDeformer(64).to(next(model.parameters()).device)
        params = transformer.parameters()
    elif self.stabilization_target == 'tvdeformer':
        transformer = VideoTVDeformer(64).to(next(model.parameters()).device)
        params = transformer.parameters()
    elif self.stabilization_target == 'residualdeformer':
        transformer = VideoResidualDeformer(64).to(next(model.parameters()).device)
        params = transformer.parameters()
    elif self.stabilization_target == 'smoothdeformer':
        transformer = VideoSmoothDeformer(64).to(next(model.parameters()).device)
        params = transformer.parameters()
    elif self.stabilization_target == 'doubledeformer':
        transformer = VideoResidualDeformer(64).to(next(model.parameters()).device)
        motiontransformer = VideoTransformer(64).to(next(model.parameters()).device)
        params = list(transformer.parameters()) + list(motiontransformer.parameters())
    elif self.stabilization_target == 'actualdoubledeformer':
        transformer = VideoResidualDeformer(64).to(next(model.parameters()).device)
        motiontransformer = VideoStabilizerConstrained(64 - 1).to(next(model.parameters()).device)
        params = list(transformer.parameters()) + list(motiontransformer.parameters())
    elif self.stabilization_target == 'videotransformer':
        params = [video.requires_grad_()]
        transformer = VideoStabilizer(64).to(next(model.parameters()).device)
        params += list(transformer.parameters())
    elif self.stabilization_target == 'sum':
        original_video = video.clone()
        params = [video.requires_grad_()]
        transformer = VideoStabilizer(64).to(next(model.parameters()).device)
        params += list(transformer.parameters())
    elif self.stabilization_target == 'deep1':
        decoder = ResNet503DDecoder.get(args)
        #decoder = ResNet503DDecoder2.get(args)
        decoder = decoder.to(next(model.parameters()).device)
        params = list(decoder.parameters())
        motiontransformer = VideoStabilizer(64 - 1).to(next(model.parameters()).device)
        params += list(motiontransformer.parameters())
    elif self.stabilization_target == 'deep2':
        decoder = ResNet503DDecoder.get(args)
        #decoder = ResNet503DDecoder2.get(args)
        decoder = decoder.to(next(model.parameters()).device)
        params = list(decoder.parameters())
    elif self.stabilization_target == 'deep3':
        decoder = ResNet503DDecoder.get(args)
        #decoder = ResNet503DDecoder2.get(args)
        decoder = decoder.to(next(model.parameters()).device)
        params = list(decoder.parameters())
    else:
        assert False, "invalid stabilization target"

    optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.weight_decay)
    video_min, video_max = video.min().item(), video.max().item()
    target = model(video)
    target = OrderedDict((k, v.detach().clone()) for k, v in target.items())  # freeze targets
    timer = Timer()
    grid_loss = torch.zeros(1).cuda()
    for num_iter in range(args.epochs):
        optimizer.zero_grad()
        if self.stabilization_target == 'video':
            video.data.clamp_(video_min, video_max)
            output = model(self.augmentation(video))
            video_transformed = video
        elif self.stabilization_target == 'network':
            video_transformed = decoder(target['layer4'])
            output = {}
            output['fc'] = target['fc']
            output['layer1'] = target['layer1']
        elif self.stabilization_target == 'network2':
            video_transformed = decoder(target['layer4'])
            output = {}
            output['fc'] = target['fc']
            output['layer1'] = target['layer1']
        elif self.stabilization_target == 'network3':
            video_transformed = decoder(target['layer2'])
            output = {}
            output['fc'] = target['fc']
            output['layer1'] = target['layer1']
        elif self.stabilization_target == 'transformer':
            video_transformed = transformer(video)
            output = model(video_transformed)
        elif self.stabilization_target == 'deformer':
            video_transformed = transformer(video)
            output = model(video_transformed)
        elif self.stabilization_target == 'tvdeformer':
            video_transformed, grid = transformer(video)
            grid_loss = (F.mse_loss(grid[:, :-1, :, :], grid[:, 1:, :, :]) +
                         F.mse_loss(grid[:, :, :-1, :], grid[:, :, 1:, :]))
            output = model(video_transformed)
        elif self.stabilization_target == 'residualdeformer':
            video_transformed, grid = transformer(video)
            grid_loss = (F.l1_loss(grid[:, :-1, :, :], grid[:, 1:, :, :]) +
                         F.l1_loss(grid[:, :, :-1, :], grid[:, :, 1:, :]))
            output = model(video_transformed)
        elif self.stabilization_target == 'smoothdeformer':
            video_transformed, grid, affine_grid = transformer(video)
            grid_loss = (F.l1_loss(grid[:, :-1, :, :], grid[:, 1:, :, :]) +
                         F.l1_loss(grid[:, :, :-1, :], grid[:, :, 1:, :]) +
                         F.mse_loss(grid[:-1, :, :, :], grid[1:, :, :, :]) +
                         F.mse_loss(affine_grid[:-1, :], affine_grid[1:, :]))
            output = model(video_transformed)
        elif self.stabilization_target == 'doubledeformer':
            video_transformed, grid = transformer(video)
            grid_loss = (F.l1_loss(grid[:, :-1, :, :], grid[:, 1:, :, :]) +
                         F.l1_loss(grid[:, :, :-1, :], grid[:, :, 1:, :]))
            output = model(video_transformed)
        elif self.stabilization_target == 'actualdoubledeformer':
            video_transformed, grid = transformer(video)
            video_motion, grid2 = motiontransformer(video_transformed[:, :-1, :, :, :])
            identity = torch.Tensor([1, 0, 0, 0, 1, 0]).float().to(grid2.device)
            grid_loss = (F.l1_loss(grid[:, :-1, :, :], grid[:, 1:, :, :]) +
                         F.l1_loss(grid[:, :, :-1, :], grid[:, :, 1:, :]) +
                         F.l1_loss(grid2[:-1, :], grid2[1:, :])
                         #F.l1_loss(grid2, identity[None, :].repeat(grid2.shape[0], 1))
                         )
            output = model(video_transformed)
        elif self.stabilization_target == 'videotransformer':
            video.data.clamp_(video_min, video_max)
            video_transformed = transformer(video)
            output = model(self.augmentation(video_transformed))
        elif self.stabilization_target == 'sum':
            video.data.clamp_(video_min, video_max)
            video_transformed = transformer(original_video)
            video_transformed += video
            output = model(self.augmentation(video_transformed))
        elif self.stabilization_target == 'deep1':
            video_transformed = decoder(target['layer4'])
            output = {}
            output['fc'] = target['fc']
            output['layer1'] = target['layer1']
        elif self.stabilization_target == 'deep2':
            video_transformed = decoder(target['layer4'])
            output = {}
            output['fc'] = target['fc']
            output['layer1'] = target['layer1']
        elif self.stabilization_target == 'deep3':
            video_transformed = decoder(target['layer4'])
            output = {}
            output['fc'] = target['fc']
            output['layer1'] = target['layer1']
        else:
            assert False, "invalid stabilization target"

        # masked content loss: ignore the central 100x100 crop
        mask = video.clone()
        mask[:] = 1
        mask[:, :, 224 // 2 - 100 // 2:224 // 2 + 100 // 2,
             224 // 2 - 100 // 2:224 // 2 + 100 // 2, :] = 0
        content_loss = ((((video - video_transformed)**2) * mask).mean()).sqrt()
        style_loss = F.mse_loss(gram_matrix(output['layer1']),
                                gram_matrix(target['layer1']))
        if self.stabilization_target == 'doubledeformer':
            motion_loss = F.l1_loss(
                video_transformed[:, 1:, :, :, :],
                motiontransformer(video_transformed[:, :-1, :, :, :]))
        elif self.stabilization_target == 'actualdoubledeformer':
            motion_loss = F.l1_loss(video_transformed[:, 1:, :, :, :], video_motion)
        elif self.stabilization_target == 'deep1':
            motion_loss = F.l1_loss(
                video_transformed[:, 1:, :, :, :],
                motiontransformer(video[:, :-1, :, :, :]))
            motion_loss += F.l1_loss(
                video[:, 1:, :, :, :],
                motiontransformer(video[:, :-1, :, :, :]))
        else:
            motion_loss = F.l1_loss(video_transformed[:, 1:, :, :, :],
                                    video_transformed[:, :-1, :, :, :])
        loss = (content_loss * self.content_weight +
                motion_loss * self.motion_weight +
                style_loss * self.style_weight +
                grid_loss * self.grid_weight)
        loss.backward()
        optimizer.step()
        timer.tic()
        if num_iter % args.print_freq == 0:
            print('    Iter: [{0}/{1}]\t'
                  'Time {timer.val:.3f} ({timer.avg:.3f}) '
                  'Content Loss: {2} \tMotion Loss: {3}\t Style Loss: {4}\t Grid Loss: {5}'.format(
                      num_iter, args.epochs, content_loss.item(), motion_loss.item(),
                      style_loss.item(), grid_loss.item(), timer=timer))
    print('Stabilization Done')
    return video_transformed, content_loss.item(), motion_loss.item()
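# --- Hedged sketch (not part of the original code) ---
# The style loss above calls a `gram_matrix` defined elsewhere in the repository.
# A common formulation, shown here only as an illustrative stand-in (hence the
# different name), flattens each channel and takes channel-by-channel
# correlations normalized by the number of spatial/temporal elements:
def gram_matrix_sketch(features):
    """Channel correlation (Gram) matrix of an (N, C, ...) feature map."""
    n, c = features.shape[0], features.shape[1]
    flat = features.reshape(n, c, -1)             # (N, C, D)
    gram = torch.bmm(flat, flat.transpose(1, 2))  # (N, C, C)
    return gram / flat.shape[2]                   # normalize by D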
def stabilize_all(self, loader, model, epoch, args):
    timer = Timer()
    content_losses = AverageMeter()
    motion_losses = AverageMeter()
    original_losses = AverageMeter()
    output_losses = AverageMeter()
    for i, (inputs, target, meta) in enumerate(loader):
        if i >= self.num_videos:
            break
        if not args.cpu:
            inputs = inputs.cuda()
            target = target.cuda(non_blocking=True)
        original = inputs.detach().clone()
        with torch.enable_grad():
            output, content_loss, motion_loss = self.stabilize_video(inputs, model, args)
        content_losses.update(content_loss)
        motion_losses.update(motion_loss)

        # prepare videos
        original = original[0]
        output = output[0]
        original *= torch.Tensor([0.229, 0.224, 0.225])[None, None, None, :].to(original.device)
        original += torch.Tensor([0.485, 0.456, 0.406])[None, None, None, :].to(original.device)
        output *= torch.Tensor([0.229, 0.224, 0.225])[None, None, None, :].to(output.device)
        output += torch.Tensor([0.485, 0.456, 0.406])[None, None, None, :].to(output.device)

        # save video
        name = '{}_{}'.format(meta[0]['id'], meta[0]['time'])
        ffmpeg_video_writer(original.cpu(), '{}/{}_original.mp4'.format(args.cache, name))
        ffmpeg_video_writer(output.cpu(), '{}/{}_processed.mp4'.format(args.cache, name))
        combined = torch.cat((original.cpu(), output.cpu()), 2)
        ffmpeg_video_writer(combined, '{}/{}_combined.mp4'.format(args.cache, name))

        # calculate stability losses
        print('calculating stability losses')
        try:
            # this can fail when there are no feature matches found
            original_trajectory = video_trajectory(original.cpu().numpy())
            original_losses.update(trajectory_loss(original_trajectory))
            output_trajectory = video_trajectory(output.cpu().numpy())
            output_losses.update(trajectory_loss(output_trajectory))
        except Exception as e:
            print(e)

        timer.tic()
        print('Stabilization: [{0}/{1}]\t'
              'Time {timer.val:.3f} ({timer.avg:.3f}) '
              'Original Loss {2} \t Output Loss {3}'.format(
                  i, self.num_videos, original_losses.avg, output_losses.avg, timer=timer))
    scores = {'stabilization_task_content_loss': content_losses.avg,
              'stabilization_task_motion_loss': motion_losses.avg,
              'stabilization_task_original_loss': original_losses.avg,
              'stabilization_task_output_loss': output_losses.avg}
    return scores
""" Video loader for the Charades dataset """ from datasets import utils from misc_utils.utils import Timer # path = '/home/SERILOCAL/xiatian.zhu/Data/test_video/ZZXQF-000002.jpg' path = '/home/nfs/x.chang/Datasets/Charades/Charades/Charades_v1_rgb/ZZXQF/ZZXQF-000002.jpg' for i in range(10): try: # ============ Temp =================== timer = Timer() img = utils.default_loader(path) # ============ Temp =================== load_img_cost = timer.thetime() - timer.end timer.tic() print('Load image from disk: {0:.3f} sec'.format(load_img_cost)) except Exception as e: print('failed to load image {}'.format(path)) print(e) raise