def get_video_transform(data_name, split_name, opt):
    normalizer = video_transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                            std=[0.229, 0.224, 0.225])
    if split_name == 'train':
        t_list = [
            video_transforms.RandomResizedCrop(opt.crop_size),
            video_transforms.RandomHorizontalFlip(),
            video_transforms.ColorJitter(brightness=0.1, contrast=0.1, hue=0.1),
        ]
    else:
        t_list = [
            video_transforms.Resize(256),
            video_transforms.CenterCrop(opt.crop_size),
        ]
    t_end = [video_transforms.ToTensor(), normalizer]
    transform = video_transforms.Compose(t_list + t_end)
    return transform
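# Usage sketch (added for illustration, not from the original source): only
# `opt.crop_size` is read by get_video_transform, so a minimal options object is
# enough. The Namespace below and the 'ucf101' data name are hypothetical placeholders.
from argparse import Namespace

opt = Namespace(crop_size=224)
train_transform = get_video_transform('ucf101', 'train', opt)
eval_transform = get_video_transform('ucf101', 'val', opt)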
}
assert CONFIG["RGB_I3D_LOAD_MODEL_PATH"] or CONFIG["FLOW_I3D_LOAD_MODEL_PATH"]

# Setup Dataset and Dataloader
if CONFIG["DATASET"] == "original":
    Dataset = bbdb_dataset.OriginalBBDBDataset
elif CONFIG["DATASET"] == "binary":
    Dataset = bbdb_dataset.BinaryBBDBDataset
else:
    assert False

with open("data_split.min.json", "r") as fp:
    data_split = json.load(fp)

test_transforms = transforms.Compose([
    video_transforms.Resize(256),
    video_transforms.CenterCrop(224),
])
dataset = Dataset(segment_filepaths=data_split["test"],
                  segment_length=CONFIG["SEGMENT_LENGTH"],
                  frameskip=CONFIG["FRAMESKIP"],
                  transform=test_transforms)
dataloader = DataLoader(dataset,
                        batch_size=CONFIG["BATCH_SIZE"],
                        pin_memory=True)

# Setup I3D
# TODO(seungjaeryanlee): Allow choosing both
if CONFIG["RGB_I3D_LOAD_MODEL_PATH"]:
    rgb_i3d = InceptionI3d(400, in_channels=3)
    rgb_i3d.replace_logits(dataset.NUM_LABELS)
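    # Hedged sketch (added; not part of the original snippet): one common way to
    # restore the fine-tuned RGB weights referenced above is a plain state-dict load
    # followed by eval mode. The checkpoint layout (a raw state_dict rather than a
    # wrapped dict) is an assumption here.
    rgb_i3d.load_state_dict(torch.load(CONFIG["RGB_I3D_LOAD_MODEL_PATH"]))
    rgb_i3d = rgb_i3d.cuda().eval()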
import os
import shutil
import math

import numpy as np
import torch
from PIL import Image
from torch.nn.utils import clip_grad_norm

import video_transforms
from tsn_dataset import TSNDataSet
from p3d_model import P3D199, get_optim_policies
from tsn_models import TSN

val_transform = video_transforms.Compose([
    video_transforms.Resize((182, 242)),
    video_transforms.CenterCrop(160),
    video_transforms.ToTensor(),
    video_transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

val_loader = torch.utils.data.DataLoader(
    TSNDataSet("", "tsntest_01.lst",
               num_segments=2,
               new_length=16,
               modality="RGB",
               image_tmpl="frame{:06d}.jpg",
               transform=val_transform,
               random_shift=False),
    batch_size=1,
def main():
    global args, best_acc1
    args = parser.parse_args()
    num_classes = args.num_classes
    start_epoch = 0
    writer = SummaryWriter(args.logdir)

    model = build_model(num_classes=num_classes, input_length=args.new_length)
    print(model)

    # create model
    print("Building model ... ")
    model = torch.nn.DataParallel(model)
    model.cuda()

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    print("Saving everything to directory %s." % (args.out_dir))

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = ReduceLROnPlateau(optimizer, verbose=True, patience=4)

    # if resume set to True, load the model and continue training
    if args.resume or args.evaluate:
        if os.path.isfile(args.model_path):
            model, optimizer, start_epoch = load_checkpoint(model, optimizer,
                                                            args.model_path)

    cudnn.benchmark = True

    is_color = True
    # scale_ratios = [1.0, 0.875, 0.75, 0.66]
    clip_mean = {'rgb': [0.485, 0.456, 0.406] * args.new_length,
                 'flow': [0.9432, 0.9359, 0.9511] * args.new_length,
                 'skeleton': [0.0071, 0.0078, 0.0079] * args.new_length}
    clip_std = {'rgb': [0.229, 0.224, 0.225] * args.new_length,
                'flow': [0.0788, 0.0753, 0.0683] * args.new_length,
                'skeleton': [0.0581, 0.0621, 0.0623] * args.new_length}
    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)

    train_transform = video_transforms.Compose([
        video_transforms.Resize((args.new_width, args.new_height)),
        video_transforms.ToTensor(),
        normalize,
    ])
    val_transform = video_transforms.Compose([
        video_transforms.Resize((args.new_width, args.new_height)),
        video_transforms.ToTensor(),
        normalize,
    ])

    train_dataset = datasets.__dict__[args.dataset](root=args.data,
                                                    source=args.train_split_file,
                                                    phase="train",
                                                    is_color=is_color,
                                                    new_length=args.new_length,
                                                    video_transform=train_transform)
    val_dataset = datasets.__dict__[args.dataset](root=args.data,
                                                  source=args.test_split_file,
                                                  phase="val",
                                                  is_color=is_color,
                                                  new_length=args.new_length,
                                                  video_transform=val_transform,
                                                  return_id=True)
    print('{} samples found, {} train samples and {} test samples.'.format(
        len(val_dataset) + len(train_dataset), len(train_dataset), len(val_dataset)))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, epoch=0, writer=writer,
                 classes=val_dataset.classes)
        return

    for epoch in range(start_epoch, args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, writer)

        # evaluate on validation set
        acc1, loss = validate(val_loader, model, criterion, epoch, writer)
        scheduler.step(loss, epoch=epoch)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': 'ThreeStreamTemporal',
            'state_dict': model.state_dict(),
            'best_acc1': best_acc1,
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
        }, is_best, 'last_checkpoint.pth.tar', args.out_dir)

    writer.close()
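# Hedged sketch (added; not the repository's actual implementation): save_checkpoint
# and load_checkpoint are called above but defined elsewhere in the project. A minimal
# version consistent with the call sites could look like the following; the
# 'model_best.pth.tar' filename and the exact checkpoint keys read back are assumptions.
import os
import shutil

import torch


def save_checkpoint(state, is_best, filename, out_dir):
    # Persist the latest training state and keep a copy of the best one so far.
    checkpoint_path = os.path.join(out_dir, filename)
    torch.save(state, checkpoint_path)
    if is_best:
        shutil.copyfile(checkpoint_path, os.path.join(out_dir, 'model_best.pth.tar'))


def load_checkpoint(model, optimizer, model_path):
    # Restore model (and, when given, optimizer) state and return the resume epoch.
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None and 'optimizer' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint.get('epoch', 0)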
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parser.parse_args()
    set_logger(log_file=args.log_file, debug_mode=args.debug_mode)

    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    cudnn.benchmark = True

    mean = [124 / 255, 117 / 255, 104 / 255]
    std = [1 / (.0167 * 255)] * 3
    normalize = transforms.Normalize(mean=mean, std=std)

    train_loader = VideoIterTrain(
        dataset_path=args.dataset_path,
        annotation_path=args.annotation_path,
        clip_length=args.clip_length,
        frame_interval=args.train_frame_interval,
        video_transform=transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.RandomCrop((224, 224)),
            transforms.ToTensor(),
            normalize,
        ]),
        name='train',
        return_item_subpath=False,
    )
    train_iter = torch.utils.data.DataLoader(train_loader,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=32,  # 4, # change this part accordingly
                                             pin_memory=True)

    val_loader = VideoIterTrain(
        dataset_path=args.dataset_path,
        annotation_path=args.annotation_path_test,
        clip_length=args.clip_length,
        frame_interval=args.val_frame_interval,
        video_transform=transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.RandomCrop((224, 224)),
            transforms.ToTensor(),
            normalize,
        ]),
        name='val',
        return_item_subpath=False,
    )
    val_iter = torch.utils.data.DataLoader(val_loader,
                                           batch_size=args.batch_size,
                                           shuffle=False,
                                           num_workers=32,  # 4, # change this part accordingly
                                           pin_memory=True)

    network = C3D(pretrained=args.pretrained_3d)
    network.to(device)

    if not path.exists(features_dir):
        mkdir(features_dir)

    # Extract C3D features for the training split
    features_writer = FeaturesWriter()
    for i_batch, (data, target, sampled_idx, dirs, vid_names) in tqdm(enumerate(train_iter)):
        data = data.to(device)
        with torch.no_grad():
            input_var = torch.autograd.Variable(data)
            outputs = network(input_var)

        for i, (dir, vid_name, start_frame) in enumerate(
                zip(dirs, vid_names, sampled_idx.cpu().numpy())):
            dir = path.join(features_dir, dir)
            features_writer.write(feature=outputs[i],
                                  video_name=vid_name,
                                  start_frame=start_frame,
                                  dir=dir)
    features_writer.dump()

    # Extract C3D features for the validation split
    features_writer = FeaturesWriter()
    for i_batch, (data, target, sampled_idx, dirs, vid_names) in tqdm(enumerate(val_iter)):
        data = data.to(device)
        with torch.no_grad():
            input_var = torch.autograd.Variable(data)
            outputs = network(input_var)

        for i, (dir, vid_name, start_frame) in enumerate(
                zip(dirs, vid_names, sampled_idx.cpu().numpy())):
            dir = path.join(features_dir, dir)
            features_writer.write(feature=outputs[i],
                                  video_name=vid_name,
                                  start_frame=start_frame,
                                  dir=dir)
    features_writer.dump()
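# Note (added): torch.autograd.Variable is deprecated and acts as a no-op wrapper in
# current PyTorch, so the extraction loops above can call the network on the tensor
# directly. The helper below is an illustrative rewrite of that step, not part of the
# original script.
def extract_clip_features(network, data, device):
    # Forward a batch of clips without tracking gradients and return the features.
    with torch.no_grad():
        return network(data.to(device))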
def extract_from_three_stream(args):
    model = main_three_stream.build_model(num_classes=args.num_classes,
                                          input_length=args.new_length)

    # create model
    print("Building model ... ")
    model = torch.nn.DataParallel(model)
    model = model.to(args.device)

    # load the trained vision model checkpoint
    if os.path.isfile(args.vision_model_path):
        model, _, start_epoch = main_three_stream.load_checkpoint(
            model, None, args.vision_model_path)

    is_color = True
    clip_mean = {
        'rgb': [0.485, 0.456, 0.406] * args.new_length,
        'flow': [0.9432, 0.9359, 0.9511] * args.new_length,
        'skeleton': [0.0071, 0.0078, 0.0079] * args.new_length
    }
    clip_std = {
        'rgb': [0.229, 0.224, 0.225] * args.new_length,
        'flow': [0.0788, 0.0753, 0.0683] * args.new_length,
        'skeleton': [0.0581, 0.0621, 0.0623] * args.new_length
    }
    normalize = video_transforms.Normalize(mean=clip_mean, std=clip_std)

    train_transform = video_transforms.Compose([
        video_transforms.Resize((args.new_width, args.new_height)),
        video_transforms.ToTensor(),
        normalize,
    ])
    val_transform = video_transforms.Compose([
        video_transforms.Resize((args.new_width, args.new_height)),
        video_transforms.ToTensor(),
        normalize,
    ])

    train_dataset = datasets.__dict__[args.dataset](
        root=args.data,
        source=args.train_split_file,
        phase="train",
        is_color=is_color,
        new_length=args.new_length,
        video_transform=train_transform,
        return_id=True)
    val_dataset = datasets.__dict__[args.dataset](
        root=args.data,
        source=args.test_split_file,
        phase="val",
        is_color=is_color,
        new_length=args.new_length,
        video_transform=val_transform,
        return_id=True)
    print('{} samples found, {} train samples and {} test samples.'.format(
        len(val_dataset) + len(train_dataset), len(train_dataset), len(val_dataset)))

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=1,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    print("Extracting train visual representations")
    outputs_clip_train = infer_three_stream(train_loader, model,
                                            classes=val_dataset.classes)
    pickle.dump(outputs_clip_train, open(args.visual_representations_train, 'wb'))

    print("Extracting validation visual representations")
    outputs_clip_val = infer_three_stream(val_loader, model,
                                          classes=val_dataset.classes)
    pickle.dump(outputs_clip_val, open(args.visual_representations_val, 'wb'))

    return outputs_clip_train, outputs_clip_val
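# Hedged sketch (added): the representations dumped above with pickle can be read back
# the same way. The exact structure of `outputs_clip_*` is whatever infer_three_stream
# returns and is not assumed here; this helper only mirrors the serialization step.
def load_visual_representations(pickle_path):
    # Load visual representations previously written by extract_from_three_stream.
    import pickle
    with open(pickle_path, 'rb') as fp:
        return pickle.load(fp)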