Example #1
 def fine_tune_autoencoder(self, inputs, model, args):
     model = copy.deepcopy(model)
     model.train()
     params = model.parameters()
     #lr = 1e-4
     #warmup = 10
     #optimizer = torch.optim.SGD(params, lr=lr, momentum=args.momentum, weight_decay=0)
     optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=0)
     criteria = AutoencoderCriterion(args)
     tol = 1e-2
     loss = torch.Tensor([999])
     timer = Timer()
     try:
         with torch.enable_grad():
             num_iter = 0
             while loss > tol:
                 #if num_iter > warmup:
                 #    lr = 1e-3
                 optimizer.zero_grad()
                 x_hat, code, x = model(inputs, None)
                 _, loss, _ = criteria(x_hat, code, x, None, None)
                 loss.backward()
                 optimizer.step()
                 num_iter += 1
                 timer.tic()
                 if num_iter % args.print_freq == 0:
                     print(
                         '    Iter: [{0}]\t'
                         'Time {timer.val:.3f} ({timer.avg:.3f}) Loss: {1}'.
                         format(num_iter, loss, timer=timer))
     except KeyboardInterrupt as e:
         print(e)
     return model, x_hat
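All of these examples time their loops with a Timer helper imported from misc_utils.utils, but its definition is not shown here. The class below is a minimal sketch reconstructed only from how the examples call it (tic(), thetime(), and the end, val, and avg attributes); the project's real implementation may differ.

import time

class Timer(object):
    # Hypothetical stand-in for misc_utils.utils.Timer, inferred from its
    # call sites in these examples; not the project's actual implementation.
    def __init__(self):
        self.end = time.time()  # timestamp of the last tic()
        self.val = 0.0          # duration of the most recent interval
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0          # running mean of all intervals so far

    def thetime(self):
        # current wall-clock time; the examples use thetime() - end to
        # measure data-loading cost
        return time.time()

    def tic(self):
        # close the current interval and start the next one
        now = time.time()
        self.val = now - self.end
        self.sum += self.val
        self.count += 1
        self.avg = self.sum / self.count
        self.end = now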
Example #2
 def get_item(self, index, shift=None):
     ims, tars, meta = [], [], {}
     meta['do_not_collate'] = True
     fps = 24
     n = self.data['datas'][index]['n']
     if shift is None:
         shift = np.random.randint(n - self.train_gap - 2)
     else:
         shift = int(shift * (n - self.train_gap - 2))
     resize = transforms.Resize(int(256. / 224 * self.input_size))
     normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                      std=[0.229, 0.224, 0.225])
     spacing = np.arange(shift, shift + self.train_gap)
     for loc in spacing:
         ii = int(np.floor(loc))
         path = '{}{:06d}.jpg'.format(self.data['datas'][index]['base'],
                                      ii + 1)
         try:
             # ============ Temp ===================
             timer = Timer()
             img = default_loader(path)
             # ============ Temp ===================
             load_img_cost = timer.thetime() - timer.end
             timer.tic()
             print(
                 'Load image from disk: {0:.3f} sec'.format(load_img_cost))
         except Exception as e:
             print('failed to load image {}'.format(path))
             print(e)
             raise
         img = resize(img)
         img = transforms.ToTensor()(img)
         # ============ Temp ===================
         # totensor_cost = timer.thetime() - timer.end
         # timer.tic()
         # print('From PIL to tensor: {0:.3f} sec'.format(totensor_cost))
         #img = 2*img - 1
         img = normalize(img)
         ims.append(img)
         target = torch.IntTensor(self.num_classes).zero_()
         for x in self.data['datas'][index]['labels']:
             if x['start'] < ii / float(fps) < x['end']:
                 target[self.cls2int(x['class'])] = 1
         tars.append(target)
     meta['id'] = self.data['datas'][index]['id']
     meta['time'] = shift
     img = torch.stack(ims).permute(0, 2, 3, 1).numpy()
     target = torch.stack(tars)
     if self.transform is not None:
         img = self.transform(img)
         # ============ Temp ===================
         # transform_cost = timer.thetime() - timer.end
         # timer.tic()
         # print('Image transform per mini-batch: {0:.3f} sec'.format(transform_cost))
     if self.target_transform is not None:
         target = self.target_transform(target)
     # batch will be b x n x h x w x c
     # target will be b x n x nc
     return img, target, meta
Example #3
    def stabilize_all(self, loader, model, epoch, args):
        timer = Timer()
        for i, (inputs, target, meta) in enumerate(loader):
            if i >= self.num_videos:
                break
            if not args.cpu:
                inputs = inputs.cuda()
                target = target.cuda(non_blocking=True)
            original = inputs.detach().clone()
            reconstructed = model(inputs, None)[0]
            specific_model, fine_tuned = self.fine_tune_autoencoder(
                inputs, model, args)
            #fine_tuned = specific_model(inputs, None)[0]

            # prepare videos
            original = original[0]
            fine_tuned = fine_tuned[0]
            reconstructed = reconstructed[0]
            original *= torch.Tensor([0.229, 0.224,
                                      0.225])[None, None,
                                              None, :].to(original.device)
            original += torch.Tensor([0.485, 0.456,
                                      0.406])[None, None,
                                              None, :].to(original.device)
            fine_tuned *= torch.Tensor([0.229, 0.224,
                                        0.225])[None, None,
                                                None, :].to(original.device)
            fine_tuned += torch.Tensor([0.485, 0.456,
                                        0.406])[None, None,
                                                None, :].to(original.device)
            reconstructed *= torch.Tensor([0.229, 0.224,
                                           0.225])[None, None,
                                                   None, :].to(original.device)
            reconstructed += torch.Tensor([0.485, 0.456,
                                           0.406])[None, None,
                                                   None, :].to(original.device)

            # save video
            name = '{}_{}'.format(meta[0]['id'], meta[0]['time'])
            ffmpeg_video_writer(original.cpu(),
                                '{}/{}_original.mp4'.format(args.cache, name))
            ffmpeg_video_writer(fine_tuned.cpu(),
                                '{}/{}_finetuned.mp4'.format(args.cache, name))
            ffmpeg_video_writer(
                reconstructed.cpu(),
                '{}/{}_reconstructed.mp4'.format(args.cache, name))
            combined = torch.cat(
                (original.cpu(), reconstructed.cpu(), fine_tuned.cpu()), 2)
            ffmpeg_video_writer(combined,
                                '{}/{}_combined.mp4'.format(args.cache, name))

            timer.tic()
            print('Autoencoder: [{0}/{1}]\t'
                  'Time {timer.val:.3f} ({timer.avg:.3f})'.format(
                      i, self.num_videos, timer=timer))

        return {}
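Examples #3 and #10 undo the ImageNet normalization inline before writing videos to disk. The same arithmetic can be factored into a small helper; the function below is a hypothetical convenience, not part of the original codebase, and assumes a T x H x W x C float tensor normalized with the usual ImageNet mean and std.

import torch

def denormalize_video(video):
    # Hypothetical helper mirroring the inline code in Examples #3 and #10:
    # multiply by the ImageNet std and add back the mean (channel-last layout).
    std = torch.Tensor([0.229, 0.224, 0.225])[None, None, None, :].to(video.device)
    mean = torch.Tensor([0.485, 0.456, 0.406])[None, None, None, :].to(video.device)
    return video * std + mean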
Example #4
def train(loader, model, optimizer, epoch, args):
    timer = Timer()
    data_time = AverageMeter()
    loss_meter = AverageMeter()
    ce_loss_meter = AverageMeter()
    cur_lr = adjust_learning_rate(args.lr_decay_rate, optimizer, epoch)
    model.train()
    optimizer.zero_grad()
    ce_loss_criterion = nn.CrossEntropyLoss()
    for i, (input, meta) in tqdm(enumerate(loader), desc="Train Epoch"):
        if args.debug and i >= debug_short_train_num:
            break
        data_time.update(timer.thetime() - timer.end)

        _batch_size = len(meta)
        target = []
        for _ in range(_batch_size):
            target.extend(meta[_]["labels"])
        target = torch.from_numpy(np.array(target))
        input = input.view(
            _batch_size * 3,
            input.shape[2],
            input.shape[3],
            input.shape[4],
            input.shape[5],
        )
        metric_feat, output = model(input)
        ce_loss = ce_loss_criterion(output.cuda(), target.long().cuda())
        loss = ce_loss

        loss.backward()
        loss_meter.update(loss.item())
        ce_loss_meter.update(ce_loss.item())
        if i % args.accum_grad == args.accum_grad - 1:
            optimizer.step()
            optimizer.zero_grad()

        if i % args.print_freq == 0 and i > 0:
            logger.info("[{0}][{1}/{2}]\t"
                        "Dataload_Time={data_time.avg:.3f}\t"
                        "Loss={loss.avg:.4f}\t"
                        "CELoss={ce_loss.avg:.4f}\t"
                        "LR={cur_lr:.7f}\t"
                        "bestAP={ap:.3f}".format(
                            epoch,
                            i,
                            len(loader),
                            data_time=data_time,
                            loss=loss_meter,
                            ce_loss=ce_loss_meter,
                            ap=args.best_score,
                            cur_lr=cur_lr,
                        ))
            loss_meter.reset()
            ce_loss_meter.reset()
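Example #4 (and Example #8 below) track losses and data-loading time with an AverageMeter, whose implementation is also not included. The sketch below follows the common PyTorch ImageNet-example pattern and matches the calls used in these snippets (update(), reset(), .val, .avg); the project's own class may differ.

class AverageMeter(object):
    # Hypothetical AverageMeter, modeled on the widely used PyTorch
    # ImageNet-example class; tracks the latest value and a running average.
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count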
Example #5
    def validate_video(self, loader, model, criterion, epoch, args):
        """ Run video-level validation on the test set """
        timer = Timer()
        ids, outputs = [], []
        for i, meta in enumerate(loader.dataset.data['datas']):
            ids.append(meta['id'])
        metrics = [m() for m in self.metrics]

        # switch to evaluate mode
        model.eval()
        criterion.eval()

        for i, (input, target, meta) in enumerate(loader):
            if not args.cpu:
                input = input.cuda()
                target = target.cuda(non_blocking=True)

            # split batch into smaller chunks
            if args.video_batch_size == -1:
                output = model(input, meta)
            else:
                output_chunks = []
                for chunk in input.split(args.video_batch_size):
                    output_chunks.append(model(chunk, meta))
                output = gather(output_chunks, input.device)

            if type(output) != tuple:
                output = (output, )
            scores, loss, score_target = criterion(*(output + (target, meta)),
                                                   synchronous=True)
            for m in metrics:
                m.update(scores, score_target)

            # store predictions
            scores_video = scores.max(dim=0)[0]
            outputs.append(scores_video.cpu())
            # ids.append(meta['id'][0])
            timer.tic()
            if i % args.print_freq == 0:
                print('[{name}] {task}: [{0}/{1}]\t'
                      'Time {timer.val:.3f} ({timer.avg:.3f})\t'
                      '{metrics}'.format(i,
                                         len(loader),
                                         timer=timer,
                                         name=args.name,
                                         task=self.name,
                                         metrics=' \t'.join(
                                             str(m) for m in metrics)))
            del loss, output, target  # make sure we don't hold on to the graph
        submission_file(ids, outputs,
                        '{}/epoch_{:03d}.txt'.format(args.cache, epoch + 1))
        metrics = dict(m.compute() for m in metrics)
        metrics = dict((self.name + '_' + k, v) for k, v in metrics.items())
        print(metrics)
        return metrics
Example #6
def read_multi_images(target_dir):
    timer = Timer()
    image_num = 0
    for f in listdir(target_dir):
        file_full_path = join(target_dir, f)
        if isfile(file_full_path):
            read_one_iamge(file_full_path)
            image_num += 1

    time_cost = timer.thetime() - timer.end
    print('Load images from disk: {0:.3f} sec'.format(time_cost))
    return time_cost, image_num
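A hypothetical call, assuming target_dir points at a directory of extracted JPEG frames:

total_sec, n_images = read_multi_images('/path/to/frames')
print('{} images in {:.3f} sec ({:.4f} sec/image)'.format(
    n_images, total_sec, total_sec / max(n_images, 1)))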
Example #7
    def alignment(self, loader, model, epoch, args, task=best_one_sec_moment):
        timer = Timer()
        abssec = MedianMeter()
        abssec0 = MedianMeter()
        randsec = MedianMeter()
        model = ActorObserverFC7Wrapper(model, args)
        model = set_distributed_backend(model, args)

        # switch to evaluate mode
        model.eval()

        def fc7_generator():
            for i, (inputs, target, meta) in enumerate(loader):
                if not args.cpu:
                    target = target.cuda(non_blocking=True)
                first_fc7, third_fc7, w_x, w_y = model(*inputs)
                timer.tic()
                if i % args.print_freq == 0:
                    print('Alignment: [{0}/{1}]\t'
                          'Time {timer.val:.3f} ({timer.avg:.3f})'.format(
                              i, len(loader), timer=timer))
                for vid, o1, o2 in zip(meta['id'], first_fc7, third_fc7):
                    yield vid, (o1.cpu().numpy(), o2.cpu().numpy())

        for key, grp in groupby(fc7_generator(), key=lambda x: x[0]):
            print('processing id: {}'.format(key))
            _, mat = fc7list2mat(grp)
            _, _, _, j, gt = task(mat, winsize=3)
            _, _, _, j0, gt0 = task(mat, winsize=0)
            _, _, _, jr, gtr = task(np.random.randn(*mat.shape), winsize=3)
            abssec.update(abs(j - gt))
            abssec0.update(abs(j0 - gt0))
            randsec.update(abs(jr - gtr))
            print(self.name)
            print('  abs3: {abs3.val:.3f} ({abs3.avg:.3f}) [{abs3.med:.3f}]'
                  '  abs0: {abs0.val:.3f} ({abs0.avg:.3f}) [{abs0.med:.3f}]'
                  '\n'
                  '  absr: {absr.val:.3f} ({absr.avg:.3f}) [{absr.med:.3f}]'.
                  format(abs3=abssec, abs0=abssec0, absr=randsec))
        scores = {
            self.name + '_1sec': abssec0.med,
            self.name + '_1sec_smooth': abssec.med,
            self.name + '_1sec_random': randsec.med
        }
        return scores
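Example #7 also relies on a MedianMeter. Again the class itself is not shown; a minimal sketch consistent with its usage there (update() plus .val, .avg, and .med attributes) could look like the following, although the real implementation may differ.

import numpy as np

class MedianMeter(object):
    # Hypothetical MedianMeter inferred from Example #7: keeps every value and
    # exposes the latest value, the running mean, and the running median.
    def __init__(self):
        self.vals = []
        self.val = 0.0
        self.avg = 0.0
        self.med = 0.0

    def update(self, val):
        self.vals.append(float(val))
        self.val = float(val)
        self.avg = float(np.mean(self.vals))
        self.med = float(np.median(self.vals))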
Example #8
    def train(self, loader, model, criterion, optimizer, epoch, metrics, args, validate=False):
        timer = Timer()
        data_time = AverageMeter()
        losses = AverageMeter()
        metrics = [m() for m in metrics]

        if validate:
            # switch to evaluate mode
            model.eval()
            criterion.eval()
            iter_size = args.val_size
            setting = 'Validate Epoch'
        else:
            # switch to train mode
            adjust_learning_rate(args.lr, args.lr_decay_rate, optimizer, epoch)
            model.train()
            criterion.train()
            optimizer.zero_grad()
            iter_size = args.train_size
            setting = 'Train Epoch'

        for i, (input, target, meta) in enumerate(part(loader, iter_size)):
            if args.synchronous:
                assert meta['id'][0] == meta['id'][1], "dataset not synced"
            data_time.update(timer.thetime() - timer.end)

            if not args.cpu:
                target = target.cuda(non_blocking=True)
            output = model(input, meta)
            if type(output) != tuple:
                output = (output,)
            scores, loss, score_target = criterion(*(output + (target, meta)))
            losses.update(loss.item())
            with torch.no_grad():
                for m in metrics:
                    m.update(scores, score_target)

            if not validate:
                loss.backward()
                if i % args.accum_grad == args.accum_grad-1:
                    print('updating parameters')
                    optimizer.step()
                    optimizer.zero_grad()

            timer.tic()
            if i % args.print_freq == 0:
                print('[{name}] {setting}: [{0}][{1}/{2}({3})]\t'
                      'Time {timer.val:.3f} ({timer.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      '{metrics}'.format(
                          epoch, i, int(len(loader)*iter_size), len(loader),
                          name=args.name, setting=setting, timer=timer,
                          data_time=data_time, loss=losses,
                          metrics=' \t'.join(str(m) for m in metrics)))
            del loss, output, target  # make sure we don't hold on to the graph

        metrics = dict(m.compute() for m in metrics)
        metrics.update({'loss': losses.avg})
        metrics = dict(('val_'+k, v) if validate else ('train_'+k, v) for k, v in metrics.items())
        return metrics
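Both training loops (Examples #4 and #8) call optimizer.step() only every args.accum_grad iterations, so gradients from several mini-batches accumulate before each parameter update and the effective batch size is roughly accum_grad times the loader's batch size. The toy loop below isolates that pattern with a made-up model and random data; like the originals, it does not rescale the loss by accum_grad.

import torch
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
accum_grad = 4  # stands in for args.accum_grad

optimizer.zero_grad()
for i in range(16):
    x = torch.randn(8, 10)                 # dummy mini-batch
    y = torch.randint(0, 2, (8,))
    loss = criterion(model(x), y)
    loss.backward()                        # gradients accumulate across iterations
    if i % accum_grad == accum_grad - 1:   # update once every accum_grad mini-batches
        optimizer.step()
        optimizer.zero_grad()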
Example #9
    def stabilize_video(self, video, model, args):
        # optimizer = torch.optim.LBFGS([video.requires_grad_()])
        if self.stabilization_target == 'video':
            params = [video.requires_grad_()]
        elif self.stabilization_target == 'network':
            decoder = ResNet503DDecoder.get(args)
            decoder = decoder.to(next(model.parameters()).device)
            params = decoder.parameters()
        elif self.stabilization_target == 'network2':
            decoder = ResNet503DDecoder2.get(args)
            decoder = decoder.to(next(model.parameters()).device)
            params = decoder.parameters()
        elif self.stabilization_target == 'network3':
            decoder = ResNet503DDecoder3.get(args)
            decoder = decoder.to(next(model.parameters()).device)
            params = decoder.parameters()
        elif self.stabilization_target == 'transformer':
            transformer = VideoStabilizer(64).to(
                next(model.parameters()).device)
            params = transformer.parameters()
        elif self.stabilization_target == 'deformer':
            transformer = VideoDeformer(64).to(next(model.parameters()).device)
            params = transformer.parameters()
        elif self.stabilization_target == 'tvdeformer':
            transformer = VideoTVDeformer(64).to(
                next(model.parameters()).device)
            params = transformer.parameters()
        elif self.stabilization_target == 'residualdeformer':
            transformer = VideoResidualDeformer(64).to(
                next(model.parameters()).device)
            params = transformer.parameters()
        elif self.stabilization_target == 'smoothdeformer':
            transformer = VideoSmoothDeformer(64).to(
                next(model.parameters()).device)
            params = transformer.parameters()
        elif self.stabilization_target == 'doubledeformer':
            transformer = VideoResidualDeformer(64).to(
                next(model.parameters()).device)
            motiontransformer = VideoTransformer(64).to(
                next(model.parameters()).device)
            params = list(transformer.parameters()) + list(
                motiontransformer.parameters())
        elif self.stabilization_target == 'actualdoubledeformer':
            transformer = VideoResidualDeformer(64).to(
                next(model.parameters()).device)
            motiontransformer = VideoStabilizerConstrained(64 - 1).to(
                next(model.parameters()).device)
            params = list(transformer.parameters()) + list(
                motiontransformer.parameters())
        elif self.stabilization_target == 'videotransformer':
            params = [video.requires_grad_()]
            transformer = VideoStabilizer(64).to(
                next(model.parameters()).device)
            params += list(transformer.parameters())
        elif self.stabilization_target == 'sum':
            original_video = video.clone()
            params = [video.requires_grad_()]
            transformer = VideoStabilizer(64).to(
                next(model.parameters()).device)
            params += list(transformer.parameters())
        elif self.stabilization_target == 'deep1':
            decoder = ResNet503DDecoder.get(args)
            #decoder = ResNet503DDecoder2.get(args)
            decoder = decoder.to(next(model.parameters()).device)
            params = list(decoder.parameters())
            motiontransformer = VideoStabilizer(64 - 1).to(
                next(model.parameters()).device)
            params += list(motiontransformer.parameters())
        elif self.stabilization_target == 'deep2':
            decoder = ResNet503DDecoder.get(args)
            #decoder = ResNet503DDecoder2.get(args)
            decoder = decoder.to(next(model.parameters()).device)
            params = list(decoder.parameters())
        elif self.stabilization_target == 'deep3':
            decoder = ResNet503DDecoder.get(args)
            #decoder = ResNet503DDecoder2.get(args)
            decoder = decoder.to(next(model.parameters()).device)
            params = list(decoder.parameters())
        else:
            assert False, "invalid stabilization target"

        optimizer = torch.optim.Adam(params,
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
        video_min, video_max = video.min().item(), video.max().item()
        target = model(video)
        target = OrderedDict((k, v.detach().clone())
                             for k, v in target.items())  # freeze targets
        timer = Timer()
        grid_loss = torch.zeros(1).cuda()
        for num_iter in range(args.epochs):
            optimizer.zero_grad()

            if self.stabilization_target == 'video':
                video.data.clamp_(video_min, video_max)
                output = model(self.augmentation(video))
                video_transformed = video
            elif self.stabilization_target == 'network':
                video_transformed = decoder(target['layer4'])
                output = {}
                output['fc'] = target['fc']
                output['layer1'] = target['layer1']
            elif self.stabilization_target == 'network2':
                video_transformed = decoder(target['layer4'])
                output = {}
                output['fc'] = target['fc']
                output['layer1'] = target['layer1']
            elif self.stabilization_target == 'network3':
                video_transformed = decoder(target['layer2'])
                output = {}
                output['fc'] = target['fc']
                output['layer1'] = target['layer1']
            elif self.stabilization_target == 'transformer':
                video_transformed = transformer(video)
                output = model(video_transformed)
            elif self.stabilization_target == 'deformer':
                video_transformed = transformer(video)
                output = model(video_transformed)
            elif self.stabilization_target == 'tvdeformer':
                video_transformed, grid = transformer(video)
                grid_loss = (
                    F.mse_loss(grid[:, :-1, :, :], grid[:, 1:, :, :]) +
                    F.mse_loss(grid[:, :, :-1, :], grid[:, :, 1:, :]))
                output = model(video_transformed)
            elif self.stabilization_target == 'residualdeformer':
                video_transformed, grid = transformer(video)
                grid_loss = (F.l1_loss(grid[:, :-1, :, :], grid[:, 1:, :, :]) +
                             F.l1_loss(grid[:, :, :-1, :], grid[:, :, 1:, :]))
                output = model(video_transformed)
            elif self.stabilization_target == 'smoothdeformer':
                video_transformed, grid, affine_grid = transformer(video)
                grid_loss = (
                    F.l1_loss(grid[:, :-1, :, :], grid[:, 1:, :, :]) +
                    F.l1_loss(grid[:, :, :-1, :], grid[:, :, 1:, :]) +
                    F.mse_loss(grid[:-1, :, :, :], grid[1:, :, :, :]) +
                    F.mse_loss(affine_grid[:-1, :], affine_grid[1:, :]))
                output = model(video_transformed)
            elif self.stabilization_target == 'doubledeformer':
                video_transformed, grid = transformer(video)
                grid_loss = (F.l1_loss(grid[:, :-1, :, :], grid[:, 1:, :, :]) +
                             F.l1_loss(grid[:, :, :-1, :], grid[:, :, 1:, :]))
                output = model(video_transformed)
            elif self.stabilization_target == 'actualdoubledeformer':
                video_transformed, grid = transformer(video)
                video_motion, grid2 = motiontransformer(
                    video_transformed[:, :-1, :, :, :])
                identity = torch.Tensor([1, 0, 0, 0, 1,
                                         0]).float().to(grid2.device)
                grid_loss = (
                    F.l1_loss(grid[:, :-1, :, :], grid[:, 1:, :, :]) +
                    F.l1_loss(grid[:, :, :-1, :], grid[:, :, 1:, :]) +
                    F.l1_loss(grid2[:-1, :], grid2[1:, :])
                    #F.l1_loss(grid2, identity[None, :].repeat(grid2.shape[0], 1))
                )
                output = model(video_transformed)
            elif self.stabilization_target == 'videotransformer':
                video.data.clamp_(video_min, video_max)
                video_transformed = transformer(video)
                output = model(self.augmentation(video_transformed))
            elif self.stabilization_target == 'sum':
                video.data.clamp_(video_min, video_max)
                video_transformed = transformer(original_video)
                video_transformed += video
                output = model(self.augmentation(video_transformed))
            elif self.stabilization_target == 'deep1':
                video_transformed = decoder(target['layer4'])
                output = {}
                output['fc'] = target['fc']
                output['layer1'] = target['layer1']
            elif self.stabilization_target == 'deep2':
                video_transformed = decoder(target['layer4'])
                output = {}
                output['fc'] = target['fc']
                output['layer1'] = target['layer1']
            elif self.stabilization_target == 'deep3':
                video_transformed = decoder(target['layer4'])
                output = {}
                output['fc'] = target['fc']
                output['layer1'] = target['layer1']
            else:
                assert False, "invalid stabilization target"

            mask = video.clone()
            mask[:] = 1
            mask[:, :, 224 // 2 - 100 // 2:224 // 2 + 100 // 2,
                 224 // 2 - 100 // 2:224 // 2 + 100 // 2, :] = 0
            content_loss = ((((video - video_transformed)**2) *
                             mask).mean()).sqrt()

            style_loss = F.mse_loss(gram_matrix(output['layer1']),
                                    gram_matrix(target['layer1']))

            if self.stabilization_target == 'doubledeformer':
                motion_loss = F.l1_loss(
                    video_transformed[:, 1:, :, :, :],
                    motiontransformer(video_transformed[:, :-1, :, :, :]))
            elif self.stabilization_target == 'actualdoubledeformer':
                motion_loss = F.l1_loss(video_transformed[:, 1:, :, :, :],
                                        video_motion)
            elif self.stabilization_target == 'deep1':
                motion_loss = F.l1_loss(
                    video_transformed[:, 1:, :, :, :],
                    motiontransformer(video[:, :-1, :, :, :]))
                motion_loss += F.l1_loss(
                    video[:, 1:, :, :, :],
                    motiontransformer(video[:, :-1, :, :, :]))
            else:
                motion_loss = F.l1_loss(video_transformed[:, 1:, :, :, :],
                                        video_transformed[:, :-1, :, :, :])

            loss = (content_loss * self.content_weight +
                    motion_loss * self.motion_weight +
                    style_loss * self.style_weight +
                    grid_loss * self.grid_weight)
            loss.backward()
            optimizer.step()
            timer.tic()
            if num_iter % args.print_freq == 0:
                print(
                    '    Iter: [{0}/{1}]\t'
                    'Time {timer.val:.3f} ({timer.avg:.3f}) '
                    'Content Loss: {2} \tMotion Loss: {3}\t Style Loss: {4}\t Grid Loss: {5}'
                    .format(num_iter,
                            args.epochs,
                            content_loss.item(),
                            motion_loss.item(),
                            style_loss.item(),
                            grid_loss.item(),
                            timer=timer))
        print('Stabilization Done')
        return video_transformed, content_loss.item(), motion_loss.item()
Example #10
    def stabilize_all(self, loader, model, epoch, args):
        timer = Timer()
        content_losses = AverageMeter()
        motion_losses = AverageMeter()
        original_losses = AverageMeter()
        output_losses = AverageMeter()
        for i, (inputs, target, meta) in enumerate(loader):
            if i >= self.num_videos:
                break
            if not args.cpu:
                inputs = inputs.cuda()
                target = target.cuda(non_blocking=True)
            original = inputs.detach().clone()
            with torch.enable_grad():
                output, content_loss, motion_loss = self.stabilize_video(
                    inputs, model, args)
            content_losses.update(content_loss)
            motion_losses.update(motion_loss)

            # prepare videos
            original = original[0]
            output = output[0]
            original *= torch.Tensor([0.229, 0.224,
                                      0.225])[None, None,
                                              None, :].to(original.device)
            original += torch.Tensor([0.485, 0.456,
                                      0.406])[None, None,
                                              None, :].to(original.device)
            output *= torch.Tensor([0.229, 0.224,
                                    0.225])[None, None,
                                            None, :].to(output.device)
            output += torch.Tensor([0.485, 0.456,
                                    0.406])[None, None,
                                            None, :].to(output.device)

            # save video
            name = '{}_{}'.format(meta[0]['id'], meta[0]['time'])
            ffmpeg_video_writer(original.cpu(),
                                '{}/{}_original.mp4'.format(args.cache, name))
            ffmpeg_video_writer(output.cpu(),
                                '{}/{}_processed.mp4'.format(args.cache, name))
            combined = torch.cat((original.cpu(), output.cpu()), 2)
            ffmpeg_video_writer(combined,
                                '{}/{}_combined.mp4'.format(args.cache, name))

            # calculate stability losses
            print('calculating stability losses')
            try:
                # this can fail when there are no feature matches found
                original_trajectory = video_trajectory(original.cpu().numpy())
                original_losses.update(trajectory_loss(original_trajectory))
                output_trajectory = video_trajectory(output.cpu().numpy())
                output_losses.update(trajectory_loss(output_trajectory))
            except Exception as e:
                print(e)
            timer.tic()
            print(
                'Stabilization: [{0}/{1}]\t'
                'Time {timer.val:.3f} ({timer.avg:.3f}) Original Loss {2} \t Output Loss {3}'
                .format(i,
                        self.num_videos,
                        original_losses.avg,
                        output_losses.avg,
                        timer=timer))

        scores = {
            'stabilization_task_content_loss': content_losses.avg,
            'stabilization_task_motion_loss': motion_losses.avg,
            'stabilization_task_original_loss': original_losses.avg,
            'stabilization_task_output_loss': output_losses.avg
        }
        return scores
""" Video loader for the Charades dataset """

from datasets import utils
from misc_utils.utils import Timer

# path = '/home/SERILOCAL/xiatian.zhu/Data/test_video/ZZXQF-000002.jpg'
path = '/home/nfs/x.chang/Datasets/Charades/Charades/Charades_v1_rgb/ZZXQF/ZZXQF-000002.jpg'

for i in range(10):
    try:
        # ============ Temp ===================
        timer = Timer()
        img = utils.default_loader(path)
        # ============ Temp ===================
        load_img_cost = timer.thetime() - timer.end
        timer.tic()
        print('Load image from disk: {0:.3f} sec'.format(load_img_cost))
    except Exception as e:
        print('failed to load image {}'.format(path))
        print(e)
        raise