Example 1
def get_val_utils(opt):
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    if opt.train_crop == 'other':
        spatial_transform = [
            Resize((opt.scale_h, opt.scale_w)),
            RandomCrop(opt.sample_size),
            ToTensor()
        ]
    else:
        spatial_transform = [
            Resize(opt.sample_size),
            CenterCrop(opt.sample_size),
            ToTensor()
        ]

    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))

    temporal_transform.append(
        TemporalEvenCrop(opt.sample_duration, opt.n_val_samples))

    temporal_transform = TemporalCompose(temporal_transform)

    val_data, collate_fn = get_validation_data(
        opt.video_path, opt.annotation_path, opt.dataset, opt.input_type,
        opt.file_type, spatial_transform, temporal_transform)

    if opt.distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_data, shuffle=False)
    else:
        val_sampler = None

    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=(opt.batch_size //
                                                         opt.n_val_samples),
                                             shuffle=False,
                                             num_workers=opt.n_threads,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             worker_init_fn=worker_init_fn,
                                             collate_fn=collate_fn)

    if opt.is_master_node:
        val_logger = Logger(opt.result_path / 'val.log',
                            ['epoch', 'loss', 'acc', 'acc_num'])
    else:
        val_logger = None
    return val_loader, val_logger
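
A minimal, hedged sketch of how the pair returned above is typically consumed once per epoch. `model`, `criterion`, `device` and `calculate_accuracy` are assumed helpers that are not defined in the example, and the logger is assumed to accept a dict keyed by the columns it was created with (shown here with the simpler ['epoch', 'loss', 'acc'] layout).

import torch

# Illustrative sketch only: one validation pass over the loader built by
# get_val_utils. All names flagged above as assumptions are placeholders.
def validate_epoch(epoch, val_loader, val_logger, model, criterion, device):
    model.eval()
    losses, accuracies = [], []
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            losses.append(criterion(outputs, targets).item())
            accuracies.append(calculate_accuracy(outputs, targets))
    if val_logger is not None:
        val_logger.log({'epoch': epoch,
                        'loss': sum(losses) / len(losses),
                        'acc': sum(accuracies) / len(accuracies)})
    return sum(losses) / len(losses)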
Example 2
def get_loaders(opt):
    """ Make dataloaders for train and validation sets
	"""
    # train loader
    norm_method = Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    spatial_transform = Compose([
        Scale((opt.sample_size, opt.sample_size)),
        Resize(256),
        CenterCrop(224),
        ToTensor(), norm_method
    ])
    temporal_transform = TemporalRandomCrop(25)
    target_transform = ClassLabel()
    training_data = get_training_set(opt, spatial_transform,
                                     temporal_transform, target_transform)
    train_loader = torch.utils.data.DataLoader(training_data,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True)

    # validation loader
    target_transform = ClassLabel()
    temporal_transform = LoopPadding(25)
    validation_data = get_validation_set(opt, spatial_transform,
                                         temporal_transform, target_transform)
    val_loader = torch.utils.data.DataLoader(validation_data,
                                             batch_size=opt.batch_size,
                                             shuffle=False,
                                             num_workers=opt.num_workers,
                                             pin_memory=True)
    return train_loader, val_loader
Example 3
def get_inference_utils(opt):
    assert opt.inference_crop in ['center', 'nocrop']

    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.extend(
        [ToTensor(), ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_transform)

    inference_data, collate_fn = get_inference_data(
        opt.video_path, opt.annotation_path, opt.dataset, opt.file_type,
        opt.inference_subset, spatial_transform, temporal_transform)

    inference_loader = torch.utils.data.DataLoader(
        inference_data,
        batch_size=opt.inference_batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True,
        worker_init_fn=worker_init_fn,
        collate_fn=collate_fn)

    return inference_loader, inference_data.class_names
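
A hedged sketch of how the loader and class names might be consumed; the (inputs, targets) batch structure mirrors the loop in Example 10 below, while `model` and `device` are assumed to exist and class_names is assumed to map class index to name.

import torch
import torch.nn.functional as F

# Illustrative sketch only: score each batch of sliding-window clips and
# record the top-1 prediction per clip.
def run_inference(inference_loader, class_names, model, device):
    model.eval()
    results = []
    with torch.no_grad():
        for inputs, targets in inference_loader:
            probs = F.softmax(model(inputs.to(device)), dim=1).cpu()
            scores, labels = torch.topk(probs, k=1)
            for score, label in zip(scores[:, 0], labels[:, 0]):
                results.append({'label': class_names[label.item()],
                                'score': score.item()})
    return results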
Example 4
def get_inference_utils(opt):
    assert opt.inference_crop in ['center', 'nocrop']

    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_transform)

    inference_data, collate_fn = get_inference_data(
        opt.inference_label_path, opt.video_id_path, 'test',
        opt.inference_frame_dir, opt.image_size,
        window_size=opt.window_size)

    inference_loader = torch.utils.data.DataLoader(
        inference_data,
        batch_size=opt.inference_batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=False,
        worker_init_fn=worker_init_fn)
        # collate_fn=collate_fn)

    return inference_loader, inference_data.class_names
Example 5
def get_val_utils(opt):
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [
        Resize(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToArray(),
    ]
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        TemporalEvenCrop(opt.sample_duration, opt.n_val_samples))
    temporal_transform = TemporalCompose(temporal_transform)

    val_data = get_validation_data(opt.video_path, opt.annotation_path,
                                   opt.dataset, opt.input_type, opt.file_type,
                                   spatial_transform, temporal_transform)
    val_loader = paddle.batch(val_data.reader, batch_size=opt.batch_size)

    val_logger = Logger(opt.result_path / 'val.log', ['epoch', 'loss', 'acc'])

    return val_loader, val_logger
Example 6
def get_inference_utils(opt):
    assert opt.inference_crop in ['center', 'nocrop']

    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    # ToArray must run regardless of the crop mode, otherwise ScaleValue and
    # normalize would receive PIL images in the 'nocrop' case
    spatial_transform.append(ToArray())
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_transform)

    inference_data = get_inference_data(opt.video_path, opt.annotation_path,
                                        opt.dataset, opt.input_type,
                                        opt.file_type, opt.inference_subset,
                                        spatial_transform, temporal_transform)

    inference_loader = paddle.batch(inference_data.reader,
                                    batch_size=opt.inference_batch_size)

    return inference_loader, inference_data.class_names
Example 7

def get_spatial_transform():
    normalize = get_normalize_method([0.4345, 0.4051, 0.3775],
                                     [0.2768, 0.2713, 0.2737], False, False)
    spatial_transform = [Resize(112)]
    spatial_transform.append(CenterCrop(112))
    spatial_transform.append(ToTensor())
    spatial_transform.extend([ScaleValue(1), normalize])
    spatial_transform = Compose(spatial_transform)
    return spatial_transform
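
A short, hedged usage sketch for the transform built above, mirroring how Example 15 stacks per-frame tensors into a clip; `frames` is an assumed list of RGB PIL images.

import torch

# Illustrative sketch only: apply the composed transform frame by frame,
# then stack into a (C, T, H, W) clip and add a batch dimension.
spatial_transform = get_spatial_transform()
clip = [spatial_transform(img) for img in frames]   # frames: list of PIL.Image
clip = torch.stack(clip, 0).permute(1, 0, 2, 3)     # (T, C, H, W) -> (C, T, H, W)
batch = clip.unsqueeze(0)                           # shape (1, C, T, H, W)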
Example 8
def get_spatial_transform(opt):
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)
    return spatial_transform
Example 9
def get_inference_utils(opt):
    assert opt.inference_crop in ['center', 'nocrop']

    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_transform)

    inf_data_checkpoint_path = opt.result_path / Path('inf_data_' +
                                                      opt.dataset + '.data')
    inf_collate_checkpoint_path = opt.result_path / Path('inf_coll_' +
                                                         opt.dataset + '.data')
    if os.path.exists(inf_data_checkpoint_path) and os.path.exists(
            inf_collate_checkpoint_path) and opt.save_load_data_checkpoint:
        with open(inf_data_checkpoint_path, 'rb') as filehandle:
            inference_data = pickle.load(filehandle)
        with open(inf_collate_checkpoint_path, 'rb') as filehandle:
            collate_fn = pickle.load(filehandle)
    else:
        inference_data, collate_fn = get_inference_data(
            opt.video_path, opt.annotation_path, opt.dataset, opt.input_type,
            opt.file_type, opt.inference_subset, spatial_transform,
            temporal_transform)
        if opt.save_load_data_checkpoint:
            with open(inf_data_checkpoint_path, 'wb') as filehandle:
                pickle.dump(inference_data, filehandle)
            with open(inf_collate_checkpoint_path, 'wb') as filehandle:
                pickle.dump(collate_fn, filehandle)

    inference_loader = torch.utils.data.DataLoader(
        inference_data,
        batch_size=opt.inference_batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True,
        worker_init_fn=worker_init_fn,
        collate_fn=collate_fn)

    return inference_loader, inference_data.class_names
Example 10
def compute_saliency_maps(model, opt):
    # Generate tiny data loader
    # Loop through it to generate saliency maps
    assert opt.inference_crop in ['center', 'nocrop']

    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_transform)

    tiny_video_path = Path('/home/ruta/teeny_data/nturgb/jpg')
    tiny_annotation_path = Path('/home/ruta/teeny_data/ntu_01.json')
    tiny_data, collate_fn = get_inference_data(
        tiny_video_path, tiny_annotation_path, opt.dataset, opt.input_type,
        opt.file_type, opt.inference_subset, spatial_transform,
        temporal_transform)

    tiny_loader = torch.utils.data.DataLoader(
        tiny_data,
        batch_size=opt.inference_batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True,
        sampler=None,
        worker_init_fn=worker_init_fn,
        collate_fn=collate_fn)

    saliency_maps = []
    for i, (inputs, targets) in enumerate(tiny_loader):
        sal_map = get_saliency_map(inputs, targets, model, opt)
        # Plot the saliency map using matplotlib and save to a file
        plot_saliency(sal_map, i, inputs, targets)
        saliency_maps.append(sal_map)

    return saliency_maps
Example 11
def get_inference_utils(opt):
    assert opt.inference_crop in ['center', 'nocrop']

    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_transform)

    inference_data, collate_fn = get_inference_data(opt.video_path,
                                                    opt.input_type,
                                                    opt.file_type,
                                                    spatial_transform,
                                                    temporal_transform)

    #     inference_data, collate_fn = get_inference_data(
    #         opt.video_path, opt.input_type, opt.file_type,
    #         spatial_transform)

    inference_loader = torch.utils.data.DataLoader(
        inference_data,
        batch_size=opt.inference_batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True,
        worker_init_fn=worker_init_fn,
        collate_fn=collate_fn)

    df = pd.read_csv('kinetics_700_labels.csv')
    class_names = {}
    for i in range(df.shape[0]):
        row = df.iloc[i]
        class_names[row[0]] = row[1]

    return inference_loader, class_names
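
The row-by-row loop above could equivalently be written as a single dict built with zip over the first two CSV columns; a hedged one-liner assuming the same id,label layout of kinetics_700_labels.csv.

import pandas as pd

# Illustrative sketch only: same index -> name mapping as the loop above.
df = pd.read_csv('kinetics_700_labels.csv')
class_names = dict(zip(df.iloc[:, 0], df.iloc[:, 1]))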
Example 12
def retrieve_spatial_temporal_transforms(opt):
    opt.mean, opt.std = get_mean_std(opt.value_scale, dataset=opt.mean_dataset)
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [Resize(opt.sample_size),
                         CenterCrop(opt.sample_size),
                         ToTensor(),
                         ScaleValue(opt.value_scale),
                         normalize]
    spatial_transform = Compose(spatial_transform)
    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        TemporalNonOverlappingWindow(opt.sample_duration))
    temporal_transform = TemporalCompose(temporal_transform)
    return spatial_transform, temporal_transform
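
The temporal pipeline above operates on frame indices rather than pixels: given a video's full index list it yields sampled windows, which is how the score method at the end of this page uses it. A hedged sketch, assuming an `opt` carrying the fields used above.

# Illustrative sketch only: TemporalNonOverlappingWindow splits the index list
# into consecutive windows of opt.sample_duration indices (handling of the
# final, shorter window depends on the transform's implementation).
spatial_transform, temporal_transform = retrieve_spatial_temporal_transforms(opt)
frame_indices = list(range(1, 41))          # e.g. a 40-frame video
for window in temporal_transform(frame_indices):
    print(len(window), window[:4])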
Example 13
def get_val_utils(opt):
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [
        Resize(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor()
    ]
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        TemporalEvenCrop(opt.sample_duration, opt.n_val_samples))
    temporal_transform = TemporalCompose(temporal_transform)

    val_data, collate_fn = get_validation_data(
        opt.label_path, opt.video_id_path, 'val', opt.frame_dir,
        opt.image_size, window_size=opt.window_size)

    if opt.distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_data, shuffle=False)
    else:
        val_sampler = None
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=(opt.batch_size //
                                                         opt.n_val_samples),
                                             shuffle=False,
                                             num_workers=opt.n_threads,
                                             pin_memory=False,
                                             sampler=val_sampler,
                                             worker_init_fn=worker_init_fn)
                                             # collate_fn=collate_fn)

    if opt.is_master_node:
        val_logger = Logger(opt.result_path / 'val.log',
                            ['epoch', 'loss', 'acc', 'precision', 'recall', 'f1', 'tiou'])
    else:
        val_logger = None

    return val_loader, val_logger
Example 14
def get_train_utils(opt, model_parameters):
    assert opt.train_crop in ['random', 'corner', 'center']
    spatial_transform = []
    if opt.train_crop == 'random':
        spatial_transform.append(
            RandomResizedCrop(
                opt.sample_size, (opt.train_crop_min_scale, 1.0),
                (opt.train_crop_min_ratio, 1.0 / opt.train_crop_min_ratio)))
    elif opt.train_crop == 'corner':
        scales = [1.0]
        scale_step = 1 / (2**(1 / 4))
        for _ in range(1, 5):
            scales.append(scales[-1] * scale_step)
        spatial_transform.append(MultiScaleCornerCrop(opt.sample_size, scales))
    elif opt.train_crop == 'center':
        spatial_transform.append(Resize(opt.sample_size))
        spatial_transform.append(CenterCrop(opt.sample_size))
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    if not opt.no_hflip:
        spatial_transform.append(RandomHorizontalFlip())
    if opt.colorjitter:
        spatial_transform.append(ColorJitter())
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.append(ScaleValue(opt.value_scale))
    spatial_transform.append(normalize)
    spatial_transform = Compose(spatial_transform)

    assert opt.train_t_crop in ['random', 'center']
    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    if opt.train_t_crop == 'random':
        temporal_transform.append(TemporalRandomCrop(opt.sample_duration))
    elif opt.train_t_crop == 'center':
        temporal_transform.append(TemporalCenterCrop(opt.sample_duration))
    temporal_transform = TemporalCompose(temporal_transform)

    train_data = get_training_data(opt.video_path, opt.annotation_path,
                                   opt.dataset, opt.input_type, opt.file_type,
                                   spatial_transform, temporal_transform)
    if opt.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=opt.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=opt.n_threads,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               worker_init_fn=worker_init_fn)

    if opt.is_master_node:
        train_logger = Logger(opt.result_path / 'train.log',
                              ['epoch', 'loss', 'acc', 'lr'])
        train_batch_logger = Logger(
            opt.result_path / 'train_batch.log',
            ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])
    else:
        train_logger = None
        train_batch_logger = None

    if opt.nesterov:
        dampening = 0
    else:
        dampening = opt.dampening
    optimizer = SGD(model_parameters,
                    lr=opt.learning_rate,
                    momentum=opt.momentum,
                    dampening=dampening,
                    weight_decay=opt.weight_decay,
                    nesterov=opt.nesterov)

    assert opt.lr_scheduler in ['plateau', 'multistep']
    assert not (opt.lr_scheduler == 'plateau' and opt.no_val)
    if opt.lr_scheduler == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min', patience=opt.plateau_patience)
    else:
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             opt.multistep_milestones)

    return (train_loader, train_sampler, train_logger, train_batch_logger,
            optimizer, scheduler)
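
A hedged sketch of the epoch loop that typically drives the objects returned above, following standard PyTorch semantics: ReduceLROnPlateau is stepped with a validation metric, MultiStepLR once per epoch. `train_epoch`, `validate`, `model`, `criterion` and the option name opt.n_epochs are assumptions, not part of the example.

# Illustrative sketch only: driving the loader/optimizer/scheduler from
# get_train_utils.
for epoch in range(1, opt.n_epochs + 1):
    if opt.distributed and train_sampler is not None:
        train_sampler.set_epoch(epoch)      # reshuffle distributed shards
    train_epoch(epoch, train_loader, model, criterion, optimizer,
                train_logger, train_batch_logger)
    val_loss = validate(epoch, model, criterion)
    if opt.lr_scheduler == 'plateau':
        scheduler.step(val_loss)            # plateau scheduler needs the metric
    else:
        scheduler.step()                    # MultiStepLR steps once per epoch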
Example 15
def main():
    parser = argparse.ArgumentParser(description="Run model against images")
    parser.add_argument(
        '--input-glob',
        default=
        'data/kinetics_videos/jpg/yoga/0wHOYxjRmlw_000041_000051/image_000{41,42,43,44,45,46,47,48,49,50,41,42,43,44,45,46}.jpg',
        help="inputs")
    parser.add_argument("--depth", default="50", help="which model depth")
    args = parser.parse_args()

    model_file = model_files[args.depth]
    model_depth = int(args.depth)

    model = resnet.generate_model(model_depth=model_depth,
                                  n_classes=700,
                                  n_input_channels=3,
                                  shortcut_type="B",
                                  conv1_t_size=7,
                                  conv1_t_stride=1,
                                  no_max_pool=False,
                                  widen_factor=1.0)

    # model = load_pretrained_model(model, args.model, "resnet", 700)

    checkpoint = torch.load(model_file, map_location='cpu')
    arch = '{}-{}'.format("resnet", model_depth)
    print(arch, checkpoint['arch'])
    assert arch == checkpoint['arch']

    if hasattr(model, 'module'):
        # I think this is only for legacy models
        model.module.load_state_dict(checkpoint['state_dict'])
    else:
        model.load_state_dict(checkpoint['state_dict'])

    model.eval()

    image_clips = []
    files = real_glob(args.input_glob)
    files = extend_to_length(files, 16)
    print(files)
    for f in files:
        img = Image.open(f).convert("RGB")
        image_clips.append(img)

    # print("EARLY", image_clips[0][0:4,0:4,0])

    mean = [0.4345, 0.4051, 0.3775]
    std = [0.2768, 0.2713, 0.2737]
    normalize = Normalize(mean, std)

    sample_size = 112

    spatial_transform = [Resize(sample_size)]
    spatial_transform.append(CenterCrop(sample_size))
    spatial_transform.append(ToTensor())
    spatial_transform.extend([ScaleValue(1), normalize])
    spatial_transform = Compose(spatial_transform)

    # c = spatial_transform(image_clips[0])
    # c.save("raw.png")

    model_clips = []
    clip = [spatial_transform(img) for img in image_clips]
    model_clips.append(torch.stack(clip, 0).permute(1, 0, 2, 3))
    model_clips = torch.stack(model_clips, 0)

    print("Final", model_clips.shape)
    print("PEEK", model_clips[0, 0, 0, 0:4, 0:4])

    with torch.no_grad():
        outputs = model(model_clips)
        print(outputs[0][0:10])
        outputs = F.softmax(outputs, dim=1).cpu()

    sorted_scores, locs = torch.topk(outputs[0], k=3)

    print(locs[0])

    video_results = []
    for i in range(sorted_scores.size(0)):
        video_results.append({
            'label': magic_labels_700[locs[i].item()],
            'score': sorted_scores[i].item()
        })

    print(video_results)
Example 16

'''
inference_crop = 'center'
mean = [0.4345, 0.4051, 0.3775]
std = [0.2768, 0.2713, 0.2737]
no_mean_norm = False
no_std_norm = False
sample_size = 112
value_scale = 1
input_type = 'rgb'
sample_t_stride = 1
sample_duration = 16
inference_stride = 16

#normalize = get_normalize_method(mean, std, no_mean_norm, no_std_norm)
normalize = Normalize(mean, std)
spatial_transform = [Resize(sample_size)]
if inference_crop == 'center':
    spatial_transform.append(CenterCrop(sample_size))
if input_type == 'flow':
    spatial_transform.append(PickFirstChannels(n=2))
spatial_transform.append(ToTensor())
spatial_transform.extend([ScaleValue(value_scale), normalize])
spatial_transform = Compose(spatial_transform)

temporal_transform = []
if sample_t_stride > 1:
    temporal_transform.append(TemporalSubsampling(sample_t_stride))
temporal_transform.append(SlidingWindow(sample_duration, inference_stride))
temporal_transform = TemporalCompose(temporal_transform)

# Load the model
Example 17
def main():
    global args
    global best_prec1
    args = parser.parse_args()

    print('Training arguments:')
    for k, v in vars(args).items():
        print('\t{}: {}'.format(k, v))

    if args.data_name == 'ucf101':
        num_class = 101
    elif args.data_name == 'hmdb51':
        num_class = 51
    elif args.data_name == 'mine':
        num_class = 2
    else:
        raise ValueError('Unknown dataset ' + args.data_name)

    model = Model(num_class,
                  args.num_segments,
                  args.representation,
                  base_model=args.arch)
    print(model)

    if 'resnet3D' in args.arch:
        train_crop_min_ratio = 0.75
        train_crop_min_scale = 0.25
        mean = [0.4345, 0.4051, 0.3775]
        std = [0.2768, 0.2713, 0.2737]
        value_scale = 1

        train_transform = Compose([
            RandomResizedCrop(
                model.crop_size, (train_crop_min_scale, 1.0),
                (train_crop_min_ratio, 1.0 / train_crop_min_ratio)),
            RandomHorizontalFlip(),
            ToTensor(),
            ScaleValue(value_scale),
            Normalize(mean, std)
        ])
        test_transform = Compose([
            Resize(model.crop_size),
            CenterCrop(model.crop_size),
            ToTensor(),  # range [0, 255] -> [0.0,1.0]
            ScaleValue(1),
            Normalize(mean, std)
        ])

    train_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.train_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=model.get_augmentation(),  #train_transform, 
            is_train=True,
            accumulate=(not args.no_accumulation),
            model_name=args.arch),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True,
        worker_init_fn=worker_init_fn)

    val_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.test_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=torchvision.transforms.Compose([
                GroupScale(int(model.scale_size)),
                GroupCenterCrop(model.crop_size)
            ]),  # test_transform,
            is_train=True,
            accumulate=(not args.no_accumulation),
            model_name=args.arch),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
        worker_init_fn=worker_init_fn)

    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()
    cudnn.benchmark = True

    params_dict = dict(model.named_parameters())
    params = []
    for key, value in params_dict.items():
        decay_mult = 0.0 if 'bias' in key else 1.0

        if ('module.base_model.conv1' in key or 'module.base_model.bn1' in key
                or 'data_bn'
                in key) and args.representation in ['mv', 'residual']:
            lr_mult = 0.1
        elif '.fc.' in key:
            lr_mult = 1.0
        else:
            lr_mult = 0.01

        params += [{
            'params': value,
            'lr': args.lr,
            'lr_mult': lr_mult,
            'decay_mult': decay_mult
        }]

    #optimizer = torch.optim.SGD(params, weight_decay=0.001, momentum=0.9, nesterov=False)
    #scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10)
    optimizer = torch.optim.Adam(params,
                                 weight_decay=args.weight_decay,
                                 eps=0.001)
    criterion = torch.nn.CrossEntropyLoss().cuda()

    for epoch in range(args.epochs):
        cur_lr = adjust_learning_rate(optimizer, epoch, args.lr_steps,
                                      args.lr_decay)
        #cur_lr = get_lr(optimizer)

        train(train_loader, model, criterion, optimizer, epoch, cur_lr)
        #prec1, prev_val_loss = validate(val_loader, model, criterion)
        #scheduler.step(prev_val_loss)

        if epoch % args.eval_freq == 0 or epoch == args.epochs - 1:
            prec1, _ = validate(val_loader, model, criterion)

            # Record training history
            np.savez("train_history/train_history.npz",
                     loss=np.array(train_loss),
                     top1=np.array(train_prec),
                     lr=np.array(train_lr))
            np.savez("train_history/valid_history.npz",
                     loss=np.array(valid_loss),
                     top1=np.array(valid_prec))

            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_best or epoch % SAVE_FREQ == 0:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                    },
                    is_best,
                    filename='checkpoint.pth.tar')
Example 18

model.fc = fc
print('Model Loaded...')

model.load_state_dict(torch.load("./outputs/fight_reco_3DCNNmodel.pth"))
print('Loaded model state_dict...')

device = torch.device('cuda:0')
model.to(device)

value_scale = 1
mean = [0.4345, 0.4051, 0.3775]
std = [0.2768, 0.2713, 0.2737]

sample_size = 112 # resolution of frame

spatial_transform = Compose([Resize(sample_size),
                             CenterCrop(sample_size),
                             ToTensor(),
                             ScaleValue(value_scale),
                             Normalize(mean, std)])

VIDEO_PATH = "./input/test_data/fi038.mp4"

cap = cv2.VideoCapture(VIDEO_PATH)

if not cap.isOpened():
    print('Error while trying to read video. Please check again...')

# get the frame width and height
frame_width = int(cap.get(3))
frame_height = int(cap.get(4))
Example 19
def get_val_utils(opt):
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [
        Resize(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor()
    ]
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        TemporalEvenCrop(opt.sample_duration, opt.n_val_samples))
    temporal_transform = TemporalCompose(temporal_transform)

    val_data_checkpoint_path = opt.result_path / Path('val_data_' +
                                                      opt.dataset + '.data')
    val_collate_checkpoint_path = opt.result_path / Path('val_coll_' +
                                                         opt.dataset + '.data')
    if os.path.exists(val_data_checkpoint_path) and os.path.exists(
            val_collate_checkpoint_path) and opt.save_load_data_checkpoint:
        with open(val_data_checkpoint_path, 'rb') as filehandle:
            val_data = pickle.load(filehandle)
        with open(val_collate_checkpoint_path, 'rb') as filehandle:
            collate_fn = pickle.load(filehandle)
    else:
        val_data, collate_fn = get_validation_data(
            opt.video_path, opt.annotation_path, opt.dataset, opt.input_type,
            opt.file_type, spatial_transform, temporal_transform)
        if opt.save_load_data_checkpoint:
            with open(val_data_checkpoint_path, 'wb') as filehandle:
                pickle.dump(val_data, filehandle)
            with open(val_collate_checkpoint_path, 'wb') as filehandle:
                pickle.dump(collate_fn, filehandle)

    if opt.distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_data, shuffle=False)
    else:
        val_sampler = None
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=(opt.batch_size //
                                                         opt.n_val_samples),
                                             shuffle=False,
                                             num_workers=opt.n_threads,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             worker_init_fn=worker_init_fn,
                                             collate_fn=collate_fn)

    if opt.is_master_node:
        val_logger = Logger(opt.result_path / 'val.log',
                            ['epoch', 'loss', 'acc'])
    else:
        val_logger = None

    return val_loader, val_logger
Example 20
def get_train_utils(opt, model_parameters):
    assert opt.train_crop in ['random', 'corner', 'center']
    spatial_transform = []
    if opt.train_crop == 'random':
        spatial_transform.append(
            RandomResizedCrop(
                opt.sample_size, (opt.train_crop_min_scale, 1.0),
                (opt.train_crop_min_ratio, 1.0 / opt.train_crop_min_ratio)))
    elif opt.train_crop == 'corner':
        scales = [1.0]
        scale_step = 1 / (2**(1 / 4))
        for _ in range(1, 5):
            scales.append(scales[-1] * scale_step)
        spatial_transform.append(MultiScaleCornerCrop(opt.sample_size, scales))
    elif opt.train_crop == 'center':
        spatial_transform.append(Resize(opt.sample_size))
        spatial_transform.append(CenterCrop(opt.sample_size))
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    if not opt.no_hflip:
        spatial_transform.append(RandomHorizontalFlip())
    spatial_transform.append(ToArray())
    if opt.colorjitter:
        spatial_transform.append(ColorJitter())
    if opt.input_type == 'flow':
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.append(ScaleValue(opt.value_scale))
    spatial_transform.append(normalize)
    spatial_transform = Compose(spatial_transform)

    assert opt.train_t_crop in ['random', 'center']
    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    if opt.train_t_crop == 'random':
        temporal_transform.append(TemporalRandomCrop(opt.sample_duration))
    elif opt.train_t_crop == 'center':
        temporal_transform.append(TemporalCenterCrop(opt.sample_duration))
    temporal_transform = TemporalCompose(temporal_transform)

    train_data = get_training_data(opt.video_path, opt.annotation_path,
                                   opt.dataset, opt.input_type, opt.file_type,
                                   spatial_transform, temporal_transform)
    train_loader = paddle.batch(train_data.reader, batch_size=opt.batch_size)

    train_logger = Logger(opt.result_path / 'train.log',
                          ['epoch', 'loss', 'acc', 'lr'])
    train_batch_logger = Logger(
        opt.result_path / 'train_batch.log',
        ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])

    assert opt.lr_scheduler in ['plateau', 'multistep']
    assert not (opt.lr_scheduler == 'plateau' and opt.no_val)
    if opt.lr_scheduler == 'plateau':
        scheduler = ReduceLROnPlateau(learning_rate=opt.learning_rate,
                                      mode='min',
                                      patience=opt.plateau_patience)
    else:
        scheduler = MultiStepDecay(learning_rate=opt.learning_rate,
                                   milestones=opt.multistep_milestones)

    optimizer = fluid.optimizer.MomentumOptimizer(
        learning_rate=scheduler,
        momentum=opt.momentum,
        parameter_list=model_parameters,
        use_nesterov=opt.nesterov,
        regularization=fluid.regularizer.L2Decay(
            regularization_coeff=opt.weight_decay))

    return (train_loader, train_logger, train_batch_logger, optimizer,
            scheduler)
Example 21

    def score(self):

        normalize = get_normalize_method(self.opt.mean, self.opt.std, self.opt.no_mean_norm,
                                         self.opt.no_std_norm)
        spatial_transform = [
            Resize(self.opt.sample_size),
            CenterCrop(self.opt.sample_size),
            ToTensor()
        ]

        spatial_transform.extend([ScaleValue(self.opt.value_scale), normalize])
        spatial_transform = Compose(spatial_transform)

        temporal_transform = []
        if self.opt.sample_t_stride > 1:
            temporal_transform.append(TemporalSubsampling(self.opt.sample_t_stride))
        temporal_transform.append(
            TemporalEvenCrop(self.opt.sample_duration, self.opt.n_val_samples))
        temporal_transform = TemporalCompose(temporal_transform)

        frame_count = get_n_frames(self.opt.video_jpgs_dir_path)

        frame_indices = list(range(0, frame_count))

        frame_indices = temporal_transform(frame_indices)

        spatial_transform.randomize_parameters()

        image_name_formatter = lambda x: f'image_{x:05d}.jpg'

        loader = VideoLoader(image_name_formatter)

        print('frame_indices', frame_indices)

        #clips = []
        video_outputs = []
        model = generate_model(self.opt)
        model = load_pretrained_model(model, self.opt.pretrain_path,
                                      self.opt.model,
                                      self.opt.n_finetune_classes)

        i = 0
        for frame_indice in frame_indices:
            print("%d indice: %s" % (i, str(frame_indice)))
            i += 1

            clip = loader(self.opt.video_jpgs_dir_path, frame_indice)
            clip = [spatial_transform(img) for img in clip]
            clip = torch.stack(clip, 0).permute(1, 0, 2, 3)

            #parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
            #print('clips:', clips)
            #for clip in clips:
            with torch.no_grad():
                print(clip.shape)
                output = model(torch.unsqueeze(clip, 0))
                output = F.softmax(output, dim=1).cpu()
                #print(output)
                video_outputs.append(output[0])

            del clip

        video_outputs = torch.stack(video_outputs)
        average_scores = torch.mean(video_outputs, dim=0)

        #inference_loader, inference_class_names = main.get_inference_utils(self.opt)
        with self.opt.annotation_path.open('r') as f:
            data = json.load(f)

        class_to_idx = get_class_labels(data)
        idx_to_class = {}
        for name, label in class_to_idx.items():
            idx_to_class[label] = name
        print(idx_to_class)

        inference_result = inference.get_video_results(
            average_scores, idx_to_class, self.opt.output_topk)

        print(inference_result)