def get_val_utils(opt):
    """Build the validation DataLoader and the validation logger.

    Returns:
        (val_loader, val_logger); ``val_logger`` is None on non-master nodes.
    """
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    if opt.train_crop == 'other':
        # Non-square resize followed by a random crop of the sample size.
        spatial_transform = [
            Resize((opt.scale_h, opt.scale_w)),
            RandomCrop(opt.sample_size),
            ToTensor()
        ]
    else:
        spatial_transform = [
            Resize(opt.sample_size),
            CenterCrop(opt.sample_size),
            ToTensor()
        ]
    if opt.input_type == 'flow':
        # Flow frames carry only two meaningful channels.
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        TemporalEvenCrop(opt.sample_duration, opt.n_val_samples))
    temporal_transform = TemporalCompose(temporal_transform)

    val_data, collate_fn = get_validation_data(opt.video_path,
                                               opt.annotation_path,
                                               opt.dataset, opt.input_type,
                                               opt.file_type,
                                               spatial_transform,
                                               temporal_transform)
    if opt.distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_data, shuffle=False)
    else:
        val_sampler = None
    # Each video yields n_val_samples clips, so shrink the batch accordingly.
    # Floor at 1: the original integer division produced batch_size == 0
    # (a DataLoader ValueError) whenever n_val_samples > batch_size.
    val_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=max(1, opt.batch_size // opt.n_val_samples),
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True,
        sampler=val_sampler,
        worker_init_fn=worker_init_fn,
        collate_fn=collate_fn)

    if opt.is_master_node:
        val_logger = Logger(opt.result_path / 'val.log',
                            ['epoch', 'loss', 'acc', 'acc_num'])
    else:
        val_logger = None
    return val_loader, val_logger
def get_loaders(opt):
    """ Make dataloaders for train and validation sets """
    # Shared spatial pipeline (ImageNet normalization statistics).
    norm_method = Normalize(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
    spatial_transform = Compose([
        Scale((opt.sample_size, opt.sample_size)),
        Resize(256),
        CenterCrop(224),
        ToTensor(),
        norm_method,
    ])

    # Training set: random 25-frame temporal crop, shuffled batches.
    training_data = get_training_set(opt, spatial_transform,
                                     TemporalRandomCrop(25), ClassLabel())
    train_loader = torch.utils.data.DataLoader(training_data,
                                               batch_size=opt.batch_size,
                                               shuffle=True,
                                               num_workers=opt.num_workers,
                                               pin_memory=True)

    # Validation set: loop-pad short clips to 25 frames, no shuffling.
    validation_data = get_validation_set(opt, spatial_transform,
                                         LoopPadding(25), ClassLabel())
    val_loader = torch.utils.data.DataLoader(validation_data,
                                             batch_size=opt.batch_size,
                                             shuffle=False,
                                             num_workers=opt.num_workers,
                                             pin_memory=True)
    return train_loader, val_loader
def get_inference_utils(opt):
    """Create the inference DataLoader and return it with the class names."""
    assert opt.inference_crop in ['center', 'nocrop']

    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    # Spatial pipeline: resize, optional center crop, tensor + normalization.
    spatial_steps = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_steps.append(CenterCrop(opt.sample_size))
    spatial_steps += [ToTensor(), ScaleValue(opt.value_scale), normalize]
    spatial_transform = Compose(spatial_steps)

    # Temporal pipeline: optional stride subsampling, then sliding windows.
    temporal_steps = []
    if opt.sample_t_stride > 1:
        temporal_steps.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_steps.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_steps)

    inference_data, collate_fn = get_inference_data(
        opt.video_path, opt.annotation_path, opt.dataset, opt.file_type,
        opt.inference_subset, spatial_transform, temporal_transform)

    inference_loader = torch.utils.data.DataLoader(
        inference_data,
        batch_size=opt.inference_batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True,
        worker_init_fn=worker_init_fn,
        collate_fn=collate_fn)
    return inference_loader, inference_data.class_names
def get_inference_utils(opt):
    """Build the inference DataLoader for the frame-directory dataset.

    Returns (inference_loader, class_names).

    NOTE(review): the spatial/temporal transforms assembled below are never
    passed to get_inference_data — they look like dead code carried over from
    another variant; confirm whether the dataset does its own preprocessing.
    """
    assert opt.inference_crop in ['center', 'nocrop']
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        # Flow frames carry only two meaningful channels.
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)
    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_transform)
    # Dataset is driven by label/video-id files plus a frame directory,
    # not by the transforms above.
    inference_data, collate_fn = get_inference_data(opt.inference_label_path,
                                                    opt.video_id_path,
                                                    'test',
                                                    opt.inference_frame_dir,
                                                    opt.image_size,
                                                    window_size=opt.window_size)
    # collate_fn returned above is deliberately not used (see commented line).
    inference_loader = torch.utils.data.DataLoader(
        inference_data,
        batch_size=opt.inference_batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=False,
        worker_init_fn=worker_init_fn)
    # collate_fn=collate_fn)
    return inference_loader, inference_data.class_names
def get_val_utils(opt):
    """Build the paddle validation batch reader and the validation logger."""
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    # Spatial pipeline: resize, center crop, ndarray conversion, scaling.
    spatial_steps = [
        Resize(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToArray(),
    ]
    if opt.input_type == 'flow':
        spatial_steps.append(PickFirstChannels(n=2))
    spatial_steps += [ScaleValue(opt.value_scale), normalize]
    spatial_transform = Compose(spatial_steps)

    # Temporal pipeline: optional subsampling, then evenly spaced crops.
    temporal_steps = []
    if opt.sample_t_stride > 1:
        temporal_steps.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_steps.append(
        TemporalEvenCrop(opt.sample_duration, opt.n_val_samples))
    temporal_transform = TemporalCompose(temporal_steps)

    val_data = get_validation_data(opt.video_path, opt.annotation_path,
                                   opt.dataset, opt.input_type, opt.file_type,
                                   spatial_transform, temporal_transform)
    val_loader = paddle.batch(val_data.reader, batch_size=opt.batch_size)
    val_logger = Logger(opt.result_path / 'val.log', ['epoch', 'loss', 'acc'])
    return val_loader, val_logger
def get_inference_utils(opt):
    """Build the paddle inference batch reader and return the class names."""
    assert opt.inference_crop in ['center', 'nocrop']

    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    spatial_steps = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_steps.append(CenterCrop(opt.sample_size))
    spatial_steps.append(ToArray())
    if opt.input_type == 'flow':
        # Flow frames carry only two meaningful channels.
        spatial_steps.append(PickFirstChannels(n=2))
    spatial_steps += [ScaleValue(opt.value_scale), normalize]
    spatial_transform = Compose(spatial_steps)

    temporal_steps = []
    if opt.sample_t_stride > 1:
        temporal_steps.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_steps.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_steps)

    inference_data = get_inference_data(opt.video_path, opt.annotation_path,
                                        opt.dataset, opt.input_type,
                                        opt.file_type, opt.inference_subset,
                                        spatial_transform, temporal_transform)
    inference_loader = paddle.batch(inference_data.reader,
                                    batch_size=opt.inference_batch_size)
    return inference_loader, inference_data.class_names
def get_spatial_transform(sample_size=112,
                          mean=(0.4345, 0.4051, 0.3775),
                          std=(0.2768, 0.2713, 0.2737),
                          value_scale=1):
    """Build the default evaluation spatial transform pipeline.

    Generalizes the previously hard-coded Kinetics statistics, 112-px crop
    and unit value scale into keyword parameters; calling with no arguments
    behaves exactly as before.

    Args:
        sample_size: output resolution for Resize/CenterCrop.
        mean, std: per-channel normalization statistics.
        value_scale: multiplier applied before normalization.

    Returns:
        A Compose of the spatial transforms.
    """
    normalize = get_normalize_method(list(mean), list(std), False, False)
    spatial_transform = [
        Resize(sample_size),
        CenterCrop(sample_size),
        ToTensor(),
        ScaleValue(value_scale),
        normalize,
    ]
    return Compose(spatial_transform)
def get_spatial_transform(opt):
    """Assemble the inference-time spatial transform from opt settings."""
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    steps = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        steps.append(CenterCrop(opt.sample_size))
    steps.append(ToTensor())
    steps.append(ScaleValue(opt.value_scale))
    steps.append(normalize)
    return Compose(steps)
def get_inference_utils(opt):
    """Build the inference DataLoader, optionally restoring the dataset and
    collate function from pickle checkpoints to skip annotation re-parsing.

    Returns (inference_loader, class_names).
    """
    assert opt.inference_crop in ['center', 'nocrop']
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        # Flow frames carry only two meaningful channels.
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)
    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_transform)
    # Per-dataset checkpoint files holding the pickled dataset / collate_fn.
    inf_data_checkpoint_path = opt.result_path / Path('inf_data_' +
                                                      opt.dataset + '.data')
    inf_collate_checkpoint_path = opt.result_path / Path('inf_coll_' +
                                                         opt.dataset + '.data')
    if os.path.exists(inf_data_checkpoint_path) and os.path.exists(
            inf_collate_checkpoint_path) and opt.save_load_data_checkpoint:
        # Fast path: both checkpoints exist and reuse is enabled.
        with open(inf_data_checkpoint_path, 'rb') as filehandle:
            inference_data = pickle.load(filehandle)
        with open(inf_collate_checkpoint_path, 'rb') as filehandle:
            collate_fn = pickle.load(filehandle)
    else:
        inference_data, collate_fn = get_inference_data(
            opt.video_path, opt.annotation_path, opt.dataset, opt.input_type,
            opt.file_type, opt.inference_subset, spatial_transform,
            temporal_transform)
        if opt.save_load_data_checkpoint:
            # Persist for the next run.
            with open(inf_data_checkpoint_path, 'wb') as filehandle:
                pickle.dump(inference_data, filehandle)
            with open(inf_collate_checkpoint_path, 'wb') as filehandle:
                pickle.dump(collate_fn, filehandle)
    inference_loader = torch.utils.data.DataLoader(
        inference_data,
        batch_size=opt.inference_batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True,
        worker_init_fn=worker_init_fn,
        collate_fn=collate_fn)
    return inference_loader, inference_data.class_names
def compute_saliency_maps(model, opt,
                          video_path=Path('/home/ruta/teeny_data/nturgb/jpg'),
                          annotation_path=Path(
                              '/home/ruta/teeny_data/ntu_01.json')):
    """Generate, plot and collect saliency maps over a small probe dataset.

    The probe-data location was previously hard-coded; it is now exposed as
    keyword parameters with the same defaults, so existing callers are
    unaffected while other machines can point at their own data.

    Args:
        model: the network passed through to get_saliency_map.
        opt: options namespace (crop, transforms, loader settings).
        video_path: directory of probe-video jpg frames.
        annotation_path: annotation json for the probe data.

    Returns:
        List of saliency maps, one per batch of the probe loader.
    """
    assert opt.inference_crop in ['center', 'nocrop']
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        # Flow frames carry only two meaningful channels.
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_transform)

    tiny_data, collate_fn = get_inference_data(
        video_path, annotation_path, opt.dataset, opt.input_type,
        opt.file_type, opt.inference_subset, spatial_transform,
        temporal_transform)
    tiny_loader = torch.utils.data.DataLoader(
        tiny_data,
        batch_size=opt.inference_batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True,
        sampler=None,
        worker_init_fn=worker_init_fn,
        collate_fn=collate_fn)

    saliency_maps = []
    for i, (inputs, targets) in enumerate(tiny_loader):
        sal_map = get_saliency_map(inputs, targets, model, opt)
        # Plot the saliency map using matplotlib and save to a file
        plot_saliency(sal_map, i, inputs, targets)
        saliency_maps.append(sal_map)
    return saliency_maps
def get_inference_utils(opt):
    """Build the inference DataLoader for Kinetics-700 plus the label map.

    Returns:
        (inference_loader, class_names) where class_names maps the first CSV
        column (label index) to the second (label string).
    """
    assert opt.inference_crop in ['center', 'nocrop']
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [Resize(opt.sample_size)]
    if opt.inference_crop == 'center':
        spatial_transform.append(CenterCrop(opt.sample_size))
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        # Flow frames carry only two meaningful channels.
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        SlidingWindow(opt.sample_duration, opt.inference_stride))
    temporal_transform = TemporalCompose(temporal_transform)

    inference_data, collate_fn = get_inference_data(opt.video_path,
                                                    opt.input_type,
                                                    opt.file_type,
                                                    spatial_transform,
                                                    temporal_transform)
    inference_loader = torch.utils.data.DataLoader(
        inference_data,
        batch_size=opt.inference_batch_size,
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=True,
        worker_init_fn=worker_init_fn,
        collate_fn=collate_fn)

    # Column-wise id -> label mapping. Avoids the deprecated positional
    # Series indexing (row[0]/row[1]) and a Python-level loop over rows.
    df = pd.read_csv('kinetics_700_labels.csv')
    class_names = dict(zip(df.iloc[:, 0], df.iloc[:, 1]))
    return inference_loader, class_names
def retrieve_spatial_temporal_transforms(opt):
    """Return (spatial_transform, temporal_transform) for evaluation.

    Side effect: overwrites opt.mean / opt.std with the statistics of
    opt.mean_dataset before building the normalizer.
    """
    opt.mean, opt.std = get_mean_std(opt.value_scale, dataset=opt.mean_dataset)
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)

    spatial_transform = Compose([
        Resize(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor(),
        ScaleValue(opt.value_scale),
        normalize,
    ])

    temporal_steps = []
    if opt.sample_t_stride > 1:
        temporal_steps.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_steps.append(TemporalNonOverlappingWindow(opt.sample_duration))
    return spatial_transform, TemporalCompose(temporal_steps)
def get_val_utils(opt):
    """Build the validation DataLoader and logger for the frame-dir dataset.

    Returns:
        (val_loader, val_logger); ``val_logger`` is None on non-master nodes.

    NOTE(review): the spatial/temporal transforms assembled below are not
    passed to get_validation_data, and the returned collate_fn is unused —
    confirm whether the dataset handles preprocessing/collation itself.
    """
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [
        Resize(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor()
    ]
    if opt.input_type == 'flow':
        # Flow frames carry only two meaningful channels.
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)

    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        TemporalEvenCrop(opt.sample_duration, opt.n_val_samples))
    temporal_transform = TemporalCompose(temporal_transform)

    val_data, collate_fn = get_validation_data(opt.label_path,
                                               opt.video_id_path, 'val',
                                               opt.frame_dir, opt.image_size,
                                               window_size=opt.window_size)
    if opt.distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_data, shuffle=False)
    else:
        val_sampler = None
    # Floor the per-step batch at 1: the original integer division produced
    # batch_size == 0 (a DataLoader ValueError) when n_val_samples exceeded
    # batch_size.
    val_loader = torch.utils.data.DataLoader(
        val_data,
        batch_size=max(1, opt.batch_size // opt.n_val_samples),
        shuffle=False,
        num_workers=opt.n_threads,
        pin_memory=False,
        sampler=val_sampler,
        worker_init_fn=worker_init_fn)
    # collate_fn=collate_fn)

    if opt.is_master_node:
        val_logger = Logger(
            opt.result_path / 'val.log',
            ['epoch', 'loss', 'acc', 'precision', 'recall', 'f1', 'tiou'])
    else:
        val_logger = None
    return val_loader, val_logger
def get_train_utils(opt, model_parameters):
    """Build everything needed for a training run.

    Returns:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler); the loggers are None on non-master nodes.
    """
    assert opt.train_crop in ['random', 'corner', 'center']
    spatial_transform = []
    if opt.train_crop == 'random':
        spatial_transform.append(
            RandomResizedCrop(
                opt.sample_size, (opt.train_crop_min_scale, 1.0),
                (opt.train_crop_min_ratio, 1.0 / opt.train_crop_min_ratio)))
    elif opt.train_crop == 'corner':
        # Five geometrically decreasing scales (ratio 2^(-1/4)) for the
        # multi-scale corner crop.
        scales = [1.0]
        scale_step = 1 / (2**(1 / 4))
        for _ in range(1, 5):
            scales.append(scales[-1] * scale_step)
        spatial_transform.append(MultiScaleCornerCrop(opt.sample_size, scales))
    elif opt.train_crop == 'center':
        spatial_transform.append(Resize(opt.sample_size))
        spatial_transform.append(CenterCrop(opt.sample_size))
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    if not opt.no_hflip:
        spatial_transform.append(RandomHorizontalFlip())
    if opt.colorjitter:
        spatial_transform.append(ColorJitter())
    spatial_transform.append(ToTensor())
    if opt.input_type == 'flow':
        # Flow frames carry only two meaningful channels.
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.append(ScaleValue(opt.value_scale))
    spatial_transform.append(normalize)
    spatial_transform = Compose(spatial_transform)

    assert opt.train_t_crop in ['random', 'center']
    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    if opt.train_t_crop == 'random':
        temporal_transform.append(TemporalRandomCrop(opt.sample_duration))
    elif opt.train_t_crop == 'center':
        temporal_transform.append(TemporalCenterCrop(opt.sample_duration))
    temporal_transform = TemporalCompose(temporal_transform)

    train_data = get_training_data(opt.video_path, opt.annotation_path,
                                   opt.dataset, opt.input_type, opt.file_type,
                                   spatial_transform, temporal_transform)
    if opt.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data)
    else:
        train_sampler = None
    # DistributedSampler already shuffles, so shuffle only when it is absent.
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=opt.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=opt.n_threads,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               worker_init_fn=worker_init_fn)
    if opt.is_master_node:
        train_logger = Logger(opt.result_path / 'train.log',
                              ['epoch', 'loss', 'acc', 'lr'])
        train_batch_logger = Logger(
            opt.result_path / 'train_batch.log',
            ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])
    else:
        train_logger = None
        train_batch_logger = None

    # Nesterov momentum requires zero dampening.
    if opt.nesterov:
        dampening = 0
    else:
        dampening = opt.dampening
    optimizer = SGD(model_parameters,
                    lr=opt.learning_rate,
                    momentum=opt.momentum,
                    dampening=dampening,
                    weight_decay=opt.weight_decay,
                    nesterov=opt.nesterov)

    assert opt.lr_scheduler in ['plateau', 'multistep']
    # Plateau scheduling needs validation loss, so it cannot run with no_val.
    assert not (opt.lr_scheduler == 'plateau' and opt.no_val)
    if opt.lr_scheduler == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer, 'min', patience=opt.plateau_patience)
    else:
        scheduler = lr_scheduler.MultiStepLR(optimizer,
                                             opt.multistep_milestones)

    return (train_loader, train_sampler, train_logger, train_batch_logger,
            optimizer, scheduler)
def main():
    """Run a pretrained 3D-ResNet on a glob of jpg frames and print top-3
    Kinetics-700 predictions for the resulting single clip."""
    parser = argparse.ArgumentParser(description="Run model against images")
    parser.add_argument(
        '--input-glob',
        default=
        'data/kinetics_videos/jpg/yoga/0wHOYxjRmlw_000041_000051/image_000{41,42,43,44,45,46,47,48,49,50,41,42,43,44,45,46}.jpg',
        help="inputs")
    parser.add_argument("--depth", default="50", help="which model depth")
    args = parser.parse_args()
    # model_files presumably maps depth string -> checkpoint path; defined
    # elsewhere in this module.
    model_file = model_files[args.depth]
    model_depth = int(args.depth)
    model = resnet.generate_model(model_depth=model_depth,
                                  n_classes=700,
                                  n_input_channels=3,
                                  shortcut_type="B",
                                  conv1_t_size=7,
                                  conv1_t_stride=1,
                                  no_max_pool=False,
                                  widen_factor=1.0)
    # model = load_pretrained_model(model, args.model, "resnet", 700)
    checkpoint = torch.load(model_file, map_location='cpu')
    arch = '{}-{}'.format("resnet", model_depth)
    print(arch, checkpoint['arch'])
    # Sanity check: checkpoint architecture must match the built model.
    assert arch == checkpoint['arch']
    if hasattr(model, 'module'):
        # I think this only for legacy models
        model.module.load_state_dict(checkpoint['state_dict'])
    else:
        model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    # Load frames and pad/repeat the list to exactly 16 images (one clip).
    image_clips = []
    files = real_glob(args.input_glob)
    files = extend_to_length(files, 16)
    print(files)
    for f in files:
        img = Image.open(f).convert("RGB")
        image_clips.append(img)
    # print("EARLY", image_clips[0][0:4,0:4,0])

    # Kinetics normalization statistics.
    mean = [0.4345, 0.4051, 0.3775]
    std = [0.2768, 0.2713, 0.2737]
    normalize = Normalize(mean, std)
    sample_size = 112
    spatial_transform = [Resize(sample_size)]
    spatial_transform.append(CenterCrop(sample_size))
    spatial_transform.append(ToTensor())
    spatial_transform.extend([ScaleValue(1), normalize])
    spatial_transform = Compose(spatial_transform)
    # c = spatial_transform(image_clips[0])
    # c.save("raw.png")

    # Stack frames to (1, C, T, H, W): permute moves channels ahead of time.
    model_clips = []
    clip = [spatial_transform(img) for img in image_clips]
    model_clips.append(torch.stack(clip, 0).permute(1, 0, 2, 3))
    model_clips = torch.stack(model_clips, 0)
    print("Final", model_clips.shape)
    print("PEEK", model_clips[0, 0, 0, 0:4, 0:4])
    with torch.no_grad():
        outputs = model(model_clips)
        print(outputs[0][0:10])
        outputs = F.softmax(outputs, dim=1).cpu()

    # Report the top-3 classes with their softmax scores.
    sorted_scores, locs = torch.topk(outputs[0], k=3)
    print(locs[0])
    video_results = []
    for i in range(sorted_scores.size(0)):
        video_results.append({
            'label': magic_labels_700[locs[i].item()],
            'score': sorted_scores[i].item()
        })
    print(video_results)
''' inference_crop = 'center' mean = [0.4345, 0.4051, 0.3775] std = [0.2768, 0.2713, 0.2737] no_mean_norm = False no_std_norm = False sample_size = 112 value_scale = 1 input_type = 'rgb' sample_t_stride = 1 sample_duration = 16 inference_stride = 16 #normalize = get_normalize_method(mean, std, no_mean_norm, no_std_norm) normalize = Normalize(mean, std) spatial_transform = [Resize(sample_size)] if inference_crop == 'center': spatial_transform.append(CenterCrop(sample_size)) if input_type == 'flow': spatial_transform.append(PickFirstChannels(n=2)) spatial_transform.append(ToTensor()) spatial_transform.extend([ScaleValue(value_scale), normalize]) spatial_transform = Compose(spatial_transform) temporal_transform = [] if sample_t_stride > 1: temporal_transform.append(TemporalSubsampling(sample_t_stride)) temporal_transform.append(SlidingWindow(sample_duration, inference_stride)) temporal_transform = TemporalCompose(temporal_transform) # 加载模型
def main():
    """Train a CoViAR-style model end to end: build data loaders, configure
    per-layer learning-rate multipliers, then run the epoch loop with
    periodic validation and checkpointing.

    Relies on module globals: parser, best_prec1, train/valid history lists,
    SAVE_FREQ, and helpers train/validate/save_checkpoint/adjust_learning_rate.
    """
    global args
    global best_prec1
    args = parser.parse_args()
    print('Training arguments:')
    for k, v in vars(args).items():
        print('\t{}: {}'.format(k, v))

    if args.data_name == 'ucf101':
        num_class = 101
    elif args.data_name == 'hmdb51':
        num_class = 51
    elif args.data_name == 'mine':
        num_class = 2
    else:
        raise ValueError('Unknown dataset ' + args.data_name)

    model = Model(num_class, args.num_segments, args.representation,
                  base_model=args.arch)
    print(model)

    if 'resnet3D' in args.arch:
        # Kinetics-style augmentation parameters; note these transforms are
        # currently unused (the loaders below use model.get_augmentation()
        # and a Group transform pipeline instead).
        train_crop_min_ratio = 0.75
        train_crop_min_scale = 0.25
        mean = [0.4345, 0.4051, 0.3775]
        std = [0.2768, 0.2713, 0.2737]
        value_scale = 1
        train_transform = Compose([
            RandomResizedCrop(
                model.crop_size, (train_crop_min_scale, 1.0),
                (train_crop_min_ratio, 1.0 / train_crop_min_ratio)),
            RandomHorizontalFlip(),
            ToTensor(),
            ScaleValue(value_scale),
            Normalize(mean, std)
        ])
        test_trainsform = Compose([
            Resize(model.crop_size),
            CenterCrop(model.crop_size),
            ToTensor(),  # range [0, 255] -> [0.0,1.0]
            ScaleValue(1),
            Normalize(mean, std)
        ])

    train_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.train_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=model.get_augmentation(),  #train_transform,
            is_train=True,
            accumulate=(not args.no_accumulation),
            model_name=args.arch),
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True,
        worker_init_fn=worker_init_fn)

    # NOTE(review): is_train=True here looks copy-pasted from the train
    # loader — confirm whether the validation set should use is_train=False.
    val_loader = torch.utils.data.DataLoader(
        CoviarDataSet(
            args.data_root,
            args.data_name,
            video_list=args.test_list,
            num_segments=args.num_segments,
            representation=args.representation,
            transform=torchvision.transforms.Compose([
                GroupScale(int(model.scale_size)),
                GroupCenterCrop(model.crop_size)
            ]),  #test_trainsform,
            is_train=True,
            accumulate=(not args.no_accumulation),
            model_name=args.arch),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
        worker_init_fn=worker_init_fn)

    model = torch.nn.DataParallel(model, device_ids=args.gpus).cuda()
    cudnn.benchmark = True

    # Per-parameter-group lr/decay multipliers: input-side layers are slowed
    # down for mv/residual representations, fc kept at full rate, the rest
    # at 1% of the base lr. Biases get no weight decay.
    params_dict = dict(model.named_parameters())
    params = []
    for key, value in params_dict.items():
        decay_mult = 0.0 if 'bias' in key else 1.0
        if ('module.base_model.conv1' in key or 'module.base_model.bn1' in key
                or 'data_bn' in key) and args.representation in [
                    'mv', 'residual'
                ]:
            lr_mult = 0.1
        elif '.fc.' in key:
            lr_mult = 1.0
        else:
            lr_mult = 0.01
        params += [{
            'params': value,
            'lr': args.lr,
            'lr_mult': lr_mult,
            'decay_mult': decay_mult
        }]

    #optimizer = torch.optim.SGD(params, weight_decay=0.001, momentum=0.9, nesterov=False)
    #scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10)
    optimizer = torch.optim.Adam(params,
                                 weight_decay=args.weight_decay,
                                 eps=0.001)
    criterion = torch.nn.CrossEntropyLoss().cuda()

    for epoch in range(args.epochs):
        cur_lr = adjust_learning_rate(optimizer, epoch, args.lr_steps,
                                      args.lr_decay)
        #cur_lr = get_lr(optimizer)
        train(train_loader, model, criterion, optimizer, epoch, cur_lr)
        #prec1, prev_val_loss = validate(val_loader, model, criterion)
        #scheduler.step(prev_val_loss)
        if epoch % args.eval_freq == 0 or epoch == args.epochs - 1:
            prec1, _ = validate(val_loader, model, criterion)
            # Record training history
            np.savez("train_history/train_history.npz",
                     loss=np.array(train_loss),
                     top1=np.array(train_prec),
                     lr=np.array(train_lr))
            np.savez("train_history/valid_history.npz",
                     loss=np.array(valid_loss),
                     top1=np.array(valid_prec))
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            if is_best or epoch % SAVE_FREQ == 0:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                    },
                    is_best,
                    filename='checkpoint.pth.tar')
# --- Script fragment: finish building the fight-recognition 3D-CNN, load its
# trained weights, and open the test video for frame capture. `model` and
# `fc` are defined earlier in the file (outside this fragment). ---
model.fc = fc
print('Model Loaded...')
model.load_state_dict(torch.load("./outputs/fight_reco_3DCNNmodel.pth"))
print('Loaded model state_dict...')
device = torch.device('cuda:0')
model.to(device)

# Kinetics normalization statistics and clip preprocessing.
value_scale = 1
mean = [0.4345, 0.4051, 0.3775]
std = [0.2768, 0.2713, 0.2737]
sample_size = 112  # resolution of frame
spatial_transform = Compose([
    Resize(sample_size),
    CenterCrop(sample_size),
    ToTensor(),
    ScaleValue(value_scale),
    Normalize(mean, std)
])

VIDEO_PATH = "./input/test_data/fi038.mp4"
cap = cv2.VideoCapture(VIDEO_PATH)
if (cap.isOpened() == False):
    print('Error while trying to read video. Plese check again...')
# get the frame width and height
frame_width = int(cap.get(3))   # CAP_PROP_FRAME_WIDTH
frame_height = int(cap.get(4))  # CAP_PROP_FRAME_HEIGHT
def get_val_utils(opt):
    """Build the validation DataLoader and logger, optionally restoring the
    dataset and collate function from pickle checkpoints to skip annotation
    re-parsing.

    Returns (val_loader, val_logger); ``val_logger`` is None on non-master
    nodes.
    """
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    spatial_transform = [
        Resize(opt.sample_size),
        CenterCrop(opt.sample_size),
        ToTensor()
    ]
    if opt.input_type == 'flow':
        # Flow frames carry only two meaningful channels.
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.extend([ScaleValue(opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)
    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    temporal_transform.append(
        TemporalEvenCrop(opt.sample_duration, opt.n_val_samples))
    temporal_transform = TemporalCompose(temporal_transform)
    # Per-dataset checkpoint files holding the pickled dataset / collate_fn.
    val_data_checkpoint_path = opt.result_path / Path('val_data_' +
                                                      opt.dataset + '.data')
    val_collate_checkpoint_path = opt.result_path / Path('val_coll_' +
                                                         opt.dataset + '.data')
    if os.path.exists(val_data_checkpoint_path) and os.path.exists(
            val_collate_checkpoint_path) and opt.save_load_data_checkpoint:
        # Fast path: both checkpoints exist and reuse is enabled.
        with open(val_data_checkpoint_path, 'rb') as filehandle:
            val_data = pickle.load(filehandle)
        with open(val_collate_checkpoint_path, 'rb') as filehandle:
            collate_fn = pickle.load(filehandle)
    else:
        val_data, collate_fn = get_validation_data(
            opt.video_path, opt.annotation_path, opt.dataset, opt.input_type,
            opt.file_type, spatial_transform, temporal_transform)
        if opt.save_load_data_checkpoint:
            # Persist for the next run.
            with open(val_data_checkpoint_path, 'wb') as filehandle:
                pickle.dump(val_data, filehandle)
            with open(val_collate_checkpoint_path, 'wb') as filehandle:
                pickle.dump(collate_fn, filehandle)
    if opt.distributed:
        val_sampler = torch.utils.data.distributed.DistributedSampler(
            val_data, shuffle=False)
    else:
        val_sampler = None
    # Each video yields n_val_samples clips, so shrink the batch accordingly.
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=(opt.batch_size //
                                                         opt.n_val_samples),
                                             shuffle=False,
                                             num_workers=opt.n_threads,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             worker_init_fn=worker_init_fn,
                                             collate_fn=collate_fn)
    if opt.is_master_node:
        val_logger = Logger(opt.result_path / 'val.log',
                            ['epoch', 'loss', 'acc'])
    else:
        val_logger = None
    return val_loader, val_logger
def get_train_utils(opt, model_parameters):
    """Build training utilities for the paddle (fluid) code path.

    Returns:
        (train_loader, train_logger, train_batch_logger, optimizer,
         scheduler).
    """
    assert opt.train_crop in ['random', 'corner', 'center']
    spatial_transform = []
    if opt.train_crop == 'random':
        spatial_transform.append(
            RandomResizedCrop(
                opt.sample_size, (opt.train_crop_min_scale, 1.0),
                (opt.train_crop_min_ratio, 1.0 / opt.train_crop_min_ratio)))
    elif opt.train_crop == 'corner':
        # Five geometrically decreasing scales (ratio 2^(-1/4)) for the
        # multi-scale corner crop.
        scales = [1.0]
        scale_step = 1 / (2**(1 / 4))
        for _ in range(1, 5):
            scales.append(scales[-1] * scale_step)
        spatial_transform.append(MultiScaleCornerCrop(opt.sample_size, scales))
    elif opt.train_crop == 'center':
        spatial_transform.append(Resize(opt.sample_size))
        spatial_transform.append(CenterCrop(opt.sample_size))
    normalize = get_normalize_method(opt.mean, opt.std, opt.no_mean_norm,
                                     opt.no_std_norm)
    if not opt.no_hflip:
        spatial_transform.append(RandomHorizontalFlip())
    spatial_transform.append(ToArray())
    if opt.colorjitter:
        spatial_transform.append(ColorJitter())
    if opt.input_type == 'flow':
        # Flow frames carry only two meaningful channels.
        spatial_transform.append(PickFirstChannels(n=2))
    spatial_transform.append(ScaleValue(opt.value_scale))
    spatial_transform.append(normalize)
    spatial_transform = Compose(spatial_transform)

    assert opt.train_t_crop in ['random', 'center']
    temporal_transform = []
    if opt.sample_t_stride > 1:
        temporal_transform.append(TemporalSubsampling(opt.sample_t_stride))
    if opt.train_t_crop == 'random':
        temporal_transform.append(TemporalRandomCrop(opt.sample_duration))
    elif opt.train_t_crop == 'center':
        temporal_transform.append(TemporalCenterCrop(opt.sample_duration))
    temporal_transform = TemporalCompose(temporal_transform)

    train_data = get_training_data(opt.video_path, opt.annotation_path,
                                   opt.dataset, opt.input_type, opt.file_type,
                                   spatial_transform, temporal_transform)
    train_loader = paddle.batch(train_data.reader, batch_size=opt.batch_size)
    train_logger = Logger(opt.result_path / 'train.log',
                          ['epoch', 'loss', 'acc', 'lr'])
    train_batch_logger = Logger(
        opt.result_path / 'train_batch.log',
        ['epoch', 'batch', 'iter', 'loss', 'acc', 'lr'])

    assert opt.lr_scheduler in ['plateau', 'multistep']
    # Plateau scheduling needs validation loss, so it cannot run with no_val.
    assert not (opt.lr_scheduler == 'plateau' and opt.no_val)
    if opt.lr_scheduler == 'plateau':
        scheduler = ReduceLROnPlateau(learning_rate=opt.learning_rate,
                                      mode='min',
                                      patience=opt.plateau_patience)
    else:
        scheduler = MultiStepDecay(learning_rate=opt.learning_rate,
                                   milestones=opt.multistep_milestones)
    # The scheduler object itself is passed as the learning rate.
    optimizer = fluid.optimizer.MomentumOptimizer(
        learning_rate=scheduler,
        momentum=opt.momentum,
        parameter_list=model_parameters,
        use_nesterov=opt.nesterov,
        regularization=fluid.regularizer.L2Decay(
            regularization_coeff=opt.weight_decay))
    return (train_loader, train_logger, train_batch_logger, optimizer,
            scheduler)
def score(self):
    """Score every clip of one jpg-frame video with a pretrained model and
    print the top-k results averaged over all clips.

    Reads paths/settings from self.opt; prints (does not return) the result.
    """
    normalize = get_normalize_method(self.opt.mean, self.opt.std,
                                     self.opt.no_mean_norm,
                                     self.opt.no_std_norm)
    spatial_transform = [
        Resize(self.opt.sample_size),
        CenterCrop(self.opt.sample_size),
        ToTensor()
    ]
    spatial_transform.extend([ScaleValue(self.opt.value_scale), normalize])
    spatial_transform = Compose(spatial_transform)
    temporal_transform = []
    if self.opt.sample_t_stride > 1:
        temporal_transform.append(
            TemporalSubsampling(self.opt.sample_t_stride))
    temporal_transform.append(
        TemporalEvenCrop(self.opt.sample_duration, self.opt.n_val_samples))
    temporal_transform = TemporalCompose(temporal_transform)

    # Split the full frame index range into evenly spaced clips.
    frame_count = get_n_frames(self.opt.video_jpgs_dir_path)
    frame_indices = list(range(0, frame_count))
    frame_indices = temporal_transform(frame_indices)
    spatial_transform.randomize_parameters()
    image_name_formatter = lambda x: f'image_{x:05d}.jpg'
    loader = VideoLoader(image_name_formatter)
    print('frame_indices', frame_indices)
    #clips = []
    video_outputs = []
    model = generate_model(self.opt)
    model = load_pretrained_model(model, self.opt.pretrain_path,
                                  self.opt.model,
                                  self.opt.n_finetune_classes)
    i = 0
    for frame_indice in frame_indices:
        print("%d indice: %s" % (i, str(frame_indice)))
        i += 1
        # Load one clip and stack to (C, T, H, W).
        clip = loader(self.opt.video_jpgs_dir_path, frame_indice)
        clip = [spatial_transform(img) for img in clip]
        clip = torch.stack(clip, 0).permute(1, 0, 2, 3)
        #parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
        #print('clips:', clips)
        #for clip in clips:
        with torch.no_grad():
            print(clip.shape)
            output = model(torch.unsqueeze(clip, 0))
            output = F.softmax(output, dim=1).cpu()
            #print(output)
        video_outputs.append(output[0])
        del clip  # free the clip tensor before loading the next one
    # Average the per-clip softmax scores over the whole video.
    video_outputs = torch.stack(video_outputs)
    average_scores = torch.mean(video_outputs, dim=0)
    #inference_loader, inference_class_names = main.get_inference_utils(self.opt)
    with self.opt.annotation_path.open('r') as f:
        data = json.load(f)
    # Invert the class->index map to index->class for reporting.
    class_to_idx = get_class_labels(data)
    idx_to_class = {}
    for name, label in class_to_idx.items():
        idx_to_class[label] = name
    print(idx_to_class)
    inference_result = inference.get_video_results(average_scores,
                                                   idx_to_class,
                                                   self.opt.output_topk)
    print(inference_result)