Example #1
def get_video_transform(data_name, split_name, opt):
    normalizer = video_transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                            std=[0.229, 0.224, 0.225])
    t_list = []
    if split_name == 'train':
        t_list = [
            video_transforms.RandomResizedCrop(opt.crop_size),
            video_transforms.RandomHorizontalFlip(),
            video_transforms.ColorJitter(brightness=0.1, contrast=0.1, hue=0.1)
        ]
    else:
        t_list = [
            video_transforms.Resize(256),
            video_transforms.CenterCrop(opt.crop_size)
        ]

    t_end = [video_transforms.ToTensor(), normalizer]
    transform = video_transforms.Compose(t_list + t_end)
    return transform
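
# Usage sketch (not part of the original example): build train- and eval-time
# transforms from an argparse-style options object; only `crop_size` is read here.
from argparse import Namespace

_opt = Namespace(crop_size=224)  # hypothetical options object
train_transform = get_video_transform("dummy_dataset", "train", _opt)
eval_transform = get_video_transform("dummy_dataset", "val", _opt)
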
    assert CONFIG["RGB_I3D_LOAD_MODEL_PATH"] or CONFIG[
        "FLOW_I3D_LOAD_MODEL_PATH"]

    # Setup Dataset and Dataloader
    if CONFIG["DATASET"] == "original":
        Dataset = bbdb_dataset.OriginalBBDBDataset
    elif CONFIG["DATASET"] == "binary":
        Dataset = bbdb_dataset.BinaryBBDBDataset
    else:
        assert False

    with open("data_split.min.json", "r") as fp:
        data_split = json.load(fp)
    test_transforms = transforms.Compose([
        video_transforms.Resize(256),
        video_transforms.CenterCrop(224),
    ])
    dataset = Dataset(segment_filepaths=data_split["test"],
                      segment_length=CONFIG["SEGMENT_LENGTH"],
                      frameskip=CONFIG["FRAMESKIP"],
                      transform=test_transforms)
    dataloader = DataLoader(dataset,
                            batch_size=CONFIG["BATCH_SIZE"],
                            pin_memory=True)

    # Setup I3D
    # TODO(seungjaeryanlee): Allow choosing both
    if CONFIG["RGB_I3D_LOAD_MODEL_PATH"]:
        rgb_i3d = InceptionI3d(400, in_channels=3)
        rgb_i3d.replace_logits(dataset.NUM_LABELS)
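        # Likely continuation (assumption; the original example is cut off here):
        # load the fine-tuned weights named in the config and switch to eval mode.
        rgb_i3d.load_state_dict(torch.load(CONFIG["RGB_I3D_LOAD_MODEL_PATH"]))
        rgb_i3d.cuda().eval()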
Example #3
import os
import shutil
import math

import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
from torch.nn.utils import clip_grad_norm
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.tensorboard import SummaryWriter  # or tensorboardX, depending on the setup

from tsn_dataset import TSNDataSet
from p3d_model import P3D199, get_optim_policies
from tsn_models import TSN
import video_transforms

val_transform = video_transforms.Compose([
    video_transforms.Resize((182, 242)),
    video_transforms.CenterCrop(160),
    video_transforms.ToTensor(),
    video_transforms.Normalize((0.485, 0.456, 0.406),
                               (0.229, 0.224, 0.225)),
])

val_loader = torch.utils.data.DataLoader(
    TSNDataSet("", "tsntest_01.lst",
               num_segments=2,
               new_length=16,
               modality="RGB",
               image_tmpl="frame{:06d}.jpg",
               transform=val_transform,
               random_shift=False),
    batch_size=1,
)
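
# Sketch of the module-level pieces the main() below relies on (hypothetical flag
# names and defaults; the original script defines its own argument parser and
# helpers such as build_model, train, validate, load_checkpoint, save_checkpoint,
# plus a `datasets` module exposing the dataset classes by name).
import argparse

parser = argparse.ArgumentParser(description="three-stream video classification")
parser.add_argument("--data", type=str, help="dataset root directory")
parser.add_argument("--dataset", type=str, help="key into the datasets registry")
parser.add_argument("--train-split-file", type=str)
parser.add_argument("--test-split-file", type=str)
parser.add_argument("--new-length", type=int, default=1)
parser.add_argument("--new-width", type=int, default=224)
parser.add_argument("--new-height", type=int, default=224)
parser.add_argument("--num-classes", type=int)
parser.add_argument("--batch-size", type=int, default=32)
parser.add_argument("--workers", type=int, default=4)
parser.add_argument("--epochs", type=int, default=100)
parser.add_argument("--lr", type=float, default=0.01)
parser.add_argument("--momentum", type=float, default=0.9)
parser.add_argument("--weight-decay", type=float, default=1e-4)
parser.add_argument("--logdir", type=str, default="runs")
parser.add_argument("--out-dir", type=str, default="checkpoints")
parser.add_argument("--model-path", type=str, default="")
parser.add_argument("--resume", action="store_true")
parser.add_argument("--evaluate", action="store_true")
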
def main():
    global args, best_acc1
    args = parser.parse_args()
    num_classes = args.num_classes
    start_epoch = 0
    best_acc1 = 0  # best top-1 validation accuracy seen so far
    writer = SummaryWriter(args.logdir)

    # create model
    print("Building model ... ")
    model = build_model(num_classes=num_classes, input_length=args.new_length)
    print(model)

    model = torch.nn.DataParallel(model)
    model.cuda()

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    print("Saving everything to directory %s." % (args.out_dir))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, verbose=True, patience=4)

    # if resume set to True, load the model and continue training
    if args.resume or args.evaluate:
        if os.path.isfile(args.model_path):
            model, optimizer, start_epoch = load_checkpoint(model, optimizer, args.model_path)

    cudnn.benchmark = True

    is_color = True
    # scale_ratios = [1.0, 0.875, 0.75, 0.66]
    # Per-modality channel statistics, repeated once for every frame in the clip.
    clip_mean = {
        'rgb': [0.485, 0.456, 0.406] * args.new_length,
        'flow': [0.9432, 0.9359, 0.9511] * args.new_length,
        'skeleton': [0.0071, 0.0078, 0.0079] * args.new_length
    }
    clip_std = {
        'rgb': [0.229, 0.224, 0.225] * args.new_length,
        'flow': [0.0788, 0.0753, 0.0683] * args.new_length,
        'skeleton': [0.0581, 0.0621, 0.0623] * args.new_length
    }

    normalize = video_transforms.Normalize(mean=clip_mean,
                                           std=clip_std)
    train_transform = video_transforms.Compose([
            video_transforms.Resize((args.new_width, args.new_height)),
            video_transforms.ToTensor(),
            normalize,
        ])

    val_transform = video_transforms.Compose([
            video_transforms.Resize((args.new_width, args.new_height)),
            video_transforms.ToTensor(),
            normalize,
        ])

    train_dataset = datasets.__dict__[args.dataset](root=args.data,
                                                    source=args.train_split_file,
                                                    phase="train",
                                                    is_color=is_color,
                                                    new_length=args.new_length,
                                                    video_transform=train_transform)
    val_dataset = datasets.__dict__[args.dataset](root=args.data,
                                                  source=args.test_split_file,
                                                  phase="val",
                                                  is_color=is_color,
                                                  new_length=args.new_length,
                                                  video_transform=val_transform,
                                                  return_id=True)

    print('{} samples found, {} train samples and {} test samples.'.format(len(val_dataset)+len(train_dataset),
                                                                           len(train_dataset),
                                                                           len(val_dataset)))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, epoch=0, writer=writer, classes=val_dataset.classes)
        return

    for epoch in range(start_epoch, args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # evaluate on validation set
        acc1, loss = validate(val_loader, model, criterion, epoch, writer)
        scheduler.step(loss, epoch=epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        save_checkpoint({
                'epoch': epoch + 1,
                'arch': 'ThreeStreamTemporal',
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict(),
            }, is_best, 'last_checkpoint.pth.tar', args.out_dir)

    writer.close()
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    args = parser.parse_args()
    set_logger(log_file=args.log_file, debug_mode=args.debug_mode)

    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    cudnn.benchmark = True

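    # C3D-style normalization: ImageNet-like channel means rescaled to [0, 1] and a
    # single shared standard deviation (1 / (0.0167 * 255) ≈ 0.235) for all channels.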
    mean = [124 / 255, 117 / 255, 104 / 255]
    std = [1 / (.0167 * 255)] * 3
    normalize = transforms.Normalize(mean=mean, std=std)

    train_loader = VideoIterTrain(
        dataset_path=args.dataset_path,
        annotation_path=args.annotation_path,
        clip_length=args.clip_length,
        frame_interval=args.train_frame_interval,
        video_transform=transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.RandomCrop((224, 224)),
            transforms.ToTensor(),
            normalize,
        ]),
        name='train',
        return_item_subpath=False,
    )

    train_iter = torch.utils.data.DataLoader(
        train_loader,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=32,  # adjust to the number of available CPU cores
        pin_memory=True)

    val_loader = VideoIterTrain(
        dataset_path=args.dataset_path,
        annotation_path=args.annotation_path_test,
        clip_length=args.clip_length,
        frame_interval=args.val_frame_interval,
        video_transform=transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.RandomCrop((224, 224)),
            transforms.ToTensor(),
            normalize,
        ]),
        name='val',
        return_item_subpath=False,
    )

    val_iter = torch.utils.data.DataLoader(
        val_loader,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=32,  # adjust to the number of available CPU cores
        pin_memory=True)

    network = C3D(pretrained=args.pretrained_3d)
    network.to(device)

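    # `features_dir`, `path` (os.path) and `mkdir` (os.mkdir) are assumed to be
    # provided by the surrounding script, e.g. derived from the parsed arguments.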
    if not path.exists(features_dir):
        mkdir(features_dir)

    features_writer = FeaturesWriter()

    for i_batch, (data, target, sampled_idx, dirs,
                  vid_names) in tqdm(enumerate(train_iter)):
        data = data.to(device)
        with torch.no_grad():
            # Variable wrappers are no-ops in modern PyTorch; feed the tensor directly.
            outputs = network(data)

            for i, (dir, vid_name, start_frame) in enumerate(
                    zip(dirs, vid_names,
                        sampled_idx.cpu().numpy())):
                dir = path.join(features_dir, dir)
                features_writer.write(feature=outputs[i],
                                      video_name=vid_name,
                                      start_frame=start_frame,
                                      dir=dir)

    features_writer.dump()

    features_writer = FeaturesWriter()
    for i_batch, (data, target, sampled_idx, dirs,
                  vid_names) in tqdm(enumerate(val_iter)):
        data = data.to(device)
        with torch.no_grad():
            # Variable wrappers are no-ops in modern PyTorch; feed the tensor directly.
            outputs = network(data)

            for i, (dir, vid_name, start_frame) in enumerate(
                    zip(dirs, vid_names,
                        sampled_idx.cpu().numpy())):
                dir = path.join(features_dir, dir)
                features_writer.write(feature=outputs[i],
                                      video_name=vid_name,
                                      start_frame=start_frame,
                                      dir=dir)

    features_writer.dump()
Example #6
def extract_from_three_stream(args):
    # create model
    print("Building model ... ")
    model = main_three_stream.build_model(num_classes=args.num_classes,
                                          input_length=args.new_length)
    model = torch.nn.DataParallel(model)
    model = model.to(args.device)

    # load pretrained weights; no criterion or optimizer is needed for feature extraction
    if os.path.isfile(args.vision_model_path):
        model, _, start_epoch = main_three_stream.load_checkpoint(
            model, None, args.vision_model_path)
    is_color = True

    clip_mean = {
        'rgb': [0.485, 0.456, 0.406] * args.new_length,
        'flow': [0.9432, 0.9359, 0.9511] * args.new_length,
        'skeleton': [0.0071, 0.0078, 0.0079] * args.new_length
    }
    clip_std = {
        'rgb': [0.229, 0.224, 0.225] * args.new_length,
        'flow': [0.0788, 0.0753, 0.0683] * args.new_length,
        'skeleton': [0.0581, 0.0621, 0.0623] * args.new_length
    }

    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)
    train_transform = video_transforms.Compose([
        video_transforms.Resize((args.new_width, args.new_height)),
        video_transforms.ToTensor(),
        normalize,
    ])

    val_transform = video_transforms.Compose([
        video_transforms.Resize((args.new_width, args.new_height)),
        video_transforms.ToTensor(),
        normalize,
    ])

    train_dataset = datasets.__dict__[args.dataset](
        root=args.data,
        source=args.train_split_file,
        phase="train",
        is_color=is_color,
        new_length=args.new_length,
        video_transform=train_transform,
        return_id=True)
    val_dataset = datasets.__dict__[args.dataset](
        root=args.data,
        source=args.test_split_file,
        phase="val",
        is_color=is_color,
        new_length=args.new_length,
        video_transform=val_transform,
        return_id=True)

    print('{} samples found, {} train samples and {} test samples.'.format(
        len(val_dataset) + len(train_dataset), len(train_dataset),
        len(val_dataset)))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=1,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    print("Extracting train visual representations")
    outputs_clip_train = infer_three_stream(train_loader,
                                            model,
                                            classes=val_dataset.classes)
    with open(args.visual_representations_train, 'wb') as fp:
        pickle.dump(outputs_clip_train, fp)

    print("Extracting validation visual representations")
    outputs_clip_val = infer_three_stream(val_loader,
                                          model,
                                          classes=val_dataset.classes)
    with open(args.visual_representations_val, 'wb') as fp:
        pickle.dump(outputs_clip_val, fp)

    return outputs_clip_train, outputs_clip_val
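

# Usage sketch (assumption, not from the original example): reload the pickled
# clip representations later, e.g. to feed a downstream temporal model.
import pickle


def load_visual_representations(pickle_path):
    with open(pickle_path, "rb") as fp:
        return pickle.load(fp)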