def run(video, mode='rgb', weights='weights/unproc_bs4_456225.pt', num_classes=1042):
    """Classify one video with a fine-tuned I3D network and return its label.

    Args:
        video: A ``str`` is handed to ``prepare_data_mp4``; any other value
            is handed to ``prepare_data``.
        mode: ``'flow'`` builds the 2-channel network, anything else the
            3-channel one.
        weights: Path of the fine-tuned checkpoint loaded after the logits
            layer has been replaced.
        num_classes: Number of outputs of the replaced logits layer.

    Returns:
        The ``class_map`` entry of the highest-scoring class.
    """
    class_map = make_label_map()

    # isinstance also accepts str subclasses, unlike the exact type() comparison.
    if isinstance(video, str):
        data = prepare_data_mp4(video)
    else:
        data = prepare_data(video)

    # setup the model: pretrained weights first, then the fine-tuned checkpoint
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))
    i3d.replace_logits(num_classes)
    i3d.load_state_dict(torch.load(weights))
    i3d.cuda()
    i3d = nn.DataParallel(i3d)
    i3d.eval()

    inputs, _labels, _video_id = data

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        per_frame_logits = i3d(inputs)
        # Maximum over dimension 2 of the network output.
        predictions = torch.max(per_frame_logits, dim=2)[0]

    # Scores of the first batch element, sorted ascending; the last index wins.
    scores = predictions.cpu().detach().numpy()[0]
    out_labels = np.argsort(scores)

    best_label = class_map[out_labels[-1]]
    print(best_label)
    return best_label
def __init__(self, num_classes):
    """Create two I3D backbones, a dropout layer and the 1x1x1 prediction head.

    Args:
        num_classes: Stored on the instance; the head's last layer emits
            ``num_classes + 2`` channels.
    """
    super(TAL_Net, self).__init__()
    self.num_classes = num_classes

    # Two independent backbones with identical constructor arguments.
    self.I3D_1 = InceptionI3d(3, in_channels=3)
    self.I3D_2 = InceptionI3d(3, in_channels=3)
    self.dropout = nn.Dropout(p=0.5)

    # The head's input width is twice (384 + 384 + 128 + 128).
    head_in_channels = 2 * (384 + 384 + 128 + 128)
    pointwise = [1, 1, 1]

    hidden_layer = Unit3D(in_channels=head_in_channels,
                          output_channels=256,
                          kernel_shape=pointwise,
                          name='layer1')
    hidden_dropout = nn.Dropout(p=0.5)
    output_layer = Unit3D(in_channels=256,
                          output_channels=self.num_classes + 2,
                          kernel_shape=[1, 1, 1],
                          activation_fn=None,
                          use_batch_norm=False,
                          use_bias=True,
                          name='layer2')

    self.predictor = nn.Sequential(hidden_layer, hidden_dropout, output_layer)
    # Only the head is re-initialised with weight_init.
    self.predictor.apply(weight_init)
def run(max_steps=64e3, mode='rgb', root='', split='', batch_size=1, load_model='', save_dir=''):
    """Extract I3D features for every clip of every 'testing' item and save them.

    Args:
        max_steps: Not used by this function; kept for interface compatibility.
        mode: ``'flow'`` builds the 2-channel network, anything else the
            3-channel one. Also used as a sub-directory name for the output.
        root: Passed through to ``Dataset``.
        split: Passed through to ``Dataset``.
        batch_size: Batch size of the data loader.
        load_model: Checkpoint path loaded into the network.
        save_dir: Directory under which ``.npy`` feature files are written.
    """
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    dataset = Dataset(split, 'testing', root, mode, test_transforms, num=-1, save_dir=save_dir)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False,
                                             num_workers=1, pin_memory=True)

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
    else:
        i3d = InceptionI3d(400, in_channels=3)
    i3d.replace_logits(400)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()
    i3d.train(False)  # Set model to evaluate mode

    # `Variable(..., volatile=True)` no longer disables autograd (PyTorch >= 0.4);
    # torch.no_grad() is the supported way to run inference without a graph.
    with torch.no_grad():
        for inputs, _labels, name in dataloader:
            # NOTE(review): this skip test looks for save_dir/<name>.npy, which only the
            # long-clip branch below writes; the short-clip branch writes elsewhere.
            if os.path.exists(os.path.join(save_dir, name[0] + '.npy')):
                continue

            for clip_index, clip in enumerate(inputs, start=1):
                _b, _c, t, _h, _w = clip.shape
                if t > 1600:
                    # Long clip: process overlapping temporal windows and concatenate.
                    features = []
                    for start in range(1, t - 56, 1600):
                        end = min(t - 1, start + 1600 + 56)
                        start = max(1, start - 48)
                        window = clip[:, :, start:end].cuda()
                        window_features = i3d.extract_features(window)
                        features.append(
                            window_features.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
                    # NOTE(review): every long clip of the same item writes to this one path.
                    np.save(os.path.join(save_dir, name[0]), np.concatenate(features, axis=0))
                else:
                    features = i3d.extract_features(clip.cuda())
                    new_path = os.path.join(save_dir, name[0], mode)
                    if not os.path.exists(new_path):
                        os.makedirs(new_path)
                    np.save(os.path.join(new_path, str(clip_index)),
                            features.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
def model_builder():
    """Build, optionally resume, and parallel-wrap the model selected by ``args``.

    Reads ``args.model``, ``args.mode``, ``args.resume`` and ``args.distributed``.

    Returns:
        The model on the GPU, wrapped in ``DDP`` when ``args.distributed`` is
        truthy and in ``nn.DataParallel`` otherwise.

    Raises:
        ValueError: If ``args.model`` is not one of ``'i3d'``, ``'r2plus1d'``
            or ``'w3d'`` (previously this surfaced later as an unbound local).
    """
    # setup the model
    if args.model == 'i3d':
        if args.mode == 'flow':
            model = InceptionI3d(num_classes=7, in_channels=2)
            pretrained_path = 'models/flow_imagenet.pt'
        else:
            model = InceptionI3d(num_classes=7, in_channels=3, dropout_keep_prob=0.5)
            pretrained_path = 'models/rgb_imagenet.pt'
        # Every pretrained tensor except the logits layer is loaded, hence strict=False.
        backbone_state = {
            k: v
            for k, v in torch.load(pretrained_path).items()
            if 'logits' not in k
        }
        model.load_state_dict(backbone_state, strict=False)
    elif args.model == 'r2plus1d':
        model = R2Plus1DClassifier(num_classes=7)
    elif args.model == 'w3d':
        model = W3D(num_classes=7)
    else:
        raise ValueError("unknown model name: '{}'".format(args.model))

    if args.resume is not None:
        # Use a local scope to avoid dangling references
        def resume():
            if os.path.isfile(args.resume):
                print("=> loading checkpoint '{}'".format(args.resume))
                # Load onto CPU storage first; the model is moved to the GPU below.
                checkpoint = torch.load(
                    args.resume, map_location=lambda storage, loc: storage)
                model.load_state_dict(checkpoint)
                print("=> loaded checkpoint '{}' ".format(args.resume))
            else:
                print("=> no checkpoint found at '{}'".format(args.resume))

        resume()

    model = model.cuda()

    if args.distributed:
        model = DDP(model)
    else:
        model = nn.DataParallel(model)
    return model
def run(max_steps=64e3, mode='rgb',
        root='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/data/charades/Charades_v1_rgb',
        split='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/data/charades/charades.json',
        batch_size=1,
        load_model='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/pytorch-i3d/models/rgb_charades.pt',
        save_dir='/gpfs/home/lhe/xuexinwei/xxw/super-events-cvpr18-master/data/charades/charades_features'):
    """Extract I3D features for the 'training' and 'testing' splits and save them.

    One ``<save_dir>/<name>.npy`` file is written per item; items whose file
    already exists are skipped.

    Args:
        max_steps: Not used by this function; kept for interface compatibility.
        mode: ``'flow'`` builds the 2-channel network, anything else the
            3-channel one.
        root: Passed through to ``Dataset``.
        split: Passed through to ``Dataset``.
        batch_size: Batch size of both data loaders.
        load_model: Checkpoint path loaded after the logits are replaced
            with 157 outputs.
        save_dir: Output directory for the ``.npy`` files.
    """
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(split, 'training', root, mode, test_transforms, save_dir=save_dir)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                             num_workers=8, pin_memory=True)

    val_dataset = Dataset(split, 'testing', root, mode, test_transforms, save_dir=save_dir)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                                                 num_workers=8, pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
    else:
        i3d = InceptionI3d(400, in_channels=3)
    i3d.replace_logits(157)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()
    i3d.train(False)  # Set model to evaluate mode

    # `Variable(..., volatile=True)` no longer disables autograd (PyTorch >= 0.4);
    # torch.no_grad() is the supported way to run inference without a graph.
    with torch.no_grad():
        for phase in ['train', 'val']:
            for inputs, _labels, name in dataloaders[phase]:
                out_path = os.path.join(save_dir, name[0])
                if os.path.exists(out_path + '.npy'):
                    continue

                _b, _c, t, _h, _w = inputs.shape
                if t > 1600:
                    # Long video: process overlapping temporal windows and concatenate.
                    features = []
                    for start in range(1, t - 56, 1600):
                        end = min(t - 1, start + 1600 + 56)
                        start = max(1, start - 48)
                        window = inputs[:, :, start:end].cuda()
                        window_features = i3d.extract_features(window)
                        features.append(
                            window_features.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
                    np.save(out_path, np.concatenate(features, axis=0))
                else:
                    features = i3d.extract_features(inputs.cuda())
                    np.save(out_path,
                            features.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
def run(init_lr=0.1, max_steps=64e3, mode='rgb', root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json', batch_size=3 * 15, save_model='',
        weights=None, num_classes=0):
    """Run a fine-tuned I3D network over the test set and collect predicted labels.

    The network stays on the CPU (the CUDA / DataParallel calls were
    commented out in the original).

    Args:
        init_lr, max_steps, batch_size, save_model: Not used by this function;
            kept for interface compatibility.
        mode: ``'flow'`` builds the 2-channel network, anything else the
            3-channel one.
        root: Passed through to ``Dataset``.
        train_split: Passed through to ``Dataset``.
        weights: Path of the fine-tuned checkpoint. Required.
        num_classes: Number of outputs of the replaced logits layer; also
            passed to ``Dataset``.

    Returns:
        A list with one ``class_map`` entry per item yielded by the loader.

    Raises:
        ValueError: If ``weights`` is ``None``.
    """
    if weights is None:
        # Fail with a clear message instead of an obscure error inside torch.load.
        raise ValueError('weights must be the path of a fine-tuned checkpoint')

    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    eval_json = make_eval_json()
    class_map = make_label_map()

    val_dataset = Dataset(train_split, 'test', root, mode, eval_json, num_classes, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=1, shuffle=False,
                                                 num_workers=2, pin_memory=False)

    # setup the model: pretrained weights first, then the fine-tuned checkpoint
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))
    i3d.replace_logits(num_classes)
    i3d.load_state_dict(torch.load(weights))
    i3d.eval()

    preds = []
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for inputs, _labels, _video_id in val_dataloader:
            per_frame_logits = i3d(inputs)
            # Maximum over dimension 2 of the network output.
            predictions = torch.max(per_frame_logits, dim=2)[0]

            # Scores of the first batch element, sorted ascending; the last index wins.
            scores = predictions.cpu().detach().numpy()[0]
            out_labels = np.argsort(scores)

            label = class_map[out_labels[-1]]
            print(label)
            preds.append(label)
    return preds
def run(mode='rgb', batch_size=4, load_model=''):
    """Smoke-test a 112-pixel I3D checkpoint on random input and print the output.

    Args:
        mode: ``'flow'`` builds the 2-channel network, anything else the
            3-channel one.
        batch_size: Number of random clips in the test batch.
        load_model: Checkpoint path loaded into the network.
    """
    device = torch.device('cuda')

    # setup the model
    in_channels = 2 if mode == 'flow' else 3
    i3d = InceptionI3d(400, in_channels=in_channels, spatial_size=112)
    sd = torch.load(load_model)
    i3d.load_state_dict(sd)
    i3d.to(device)

    # The random batch follows batch_size and the network's channel count; the
    # original hard-coded (4, 3, ...), which ignored batch_size and fails in flow mode.
    data = torch.rand((batch_size, in_channels, 32, 112, 112)).to(device)
    print(i3d(data))
def run_on_tensor(weights, ip_tensor, num_classes):
    """Run a fine-tuned RGB I3D network on one tensor and plot a softmax curve.

    Args:
        weights: Checkpoint path loaded after the logits layer is replaced.
        ip_tensor: Input tensor; its dimension 2 is used as the target length
            for upsampling the network output.
        num_classes: Number of outputs of the replaced logits layer.

    Returns:
        ``np.argsort`` of the first batch element of the upsampled,
        transposed predictions.
    """
    i3d = InceptionI3d(400, in_channels=3)
    i3d.replace_logits(num_classes)
    i3d.load_state_dict(torch.load(weights))
    i3d.cuda()
    i3d = nn.DataParallel(i3d)
    i3d.eval()

    t = ip_tensor.shape[2]
    # Tensor.cuda() returns a copy and does not move in place; the original
    # discarded the result.
    ip_tensor = ip_tensor.cuda()

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        per_frame_logits = i3d(ip_tensor)
        predictions = F.upsample(per_frame_logits, t, mode='linear')
    predictions = predictions.transpose(2, 1)

    # Convert once and reuse for both the ranking and the plot.
    scores = predictions.cpu().detach().numpy()[0]
    out_labels = np.argsort(scores)

    # Column 0 of the first batch element, softmax-normalised over its length.
    arr = scores[:, 0].T
    plt.plot(range(len(arr)), F.softmax(torch.from_numpy(arr), dim=0).numpy())
    plt.show()

    return out_labels
def run(max_steps=64e3, load_model='', root='/l/vision/v7/wang617/taiwan', batch_size=1, save_dir=''):
    """Extract I3D features for every item of the dataset and record them in a list file.

    For each item a ``<save_dir>/<name>.npy`` file is written and a
    ``name,label`` line is appended to the hard-coded feature-list file.

    Args:
        max_steps: Not used by this function; kept for interface compatibility.
        load_model: Checkpoint path loaded into the network.
        root: Passed through to ``Dataset``.
        batch_size: Batch size of the data loader.
        save_dir: Output directory for the ``.npy`` files.
    """
    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
    dataset = Dataset(root, test_transforms, save_dir=save_dir)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=8)

    i3d = InceptionI3d(400, in_channels=3)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()
    i3d.train(False)  # Set model to evaluate mode

    feature_list_path = '/l/vision/v7/wang617/taiwan_data/i3d_feature_list.txt'
    count = 0
    start = time.time()

    # `Variable(..., volatile=True)` no longer disables autograd (PyTorch >= 0.4);
    # torch.no_grad() is the supported way to run inference without a graph.
    with torch.no_grad():
        for inputs, label, name in dataloader:
            label = str(label.numpy()[0])
            features = i3d.extract_features(inputs.cuda())
            np.save(os.path.join(save_dir, name[0]), features.squeeze().data.cpu().numpy())

            # Append and close on every item so no handle is leaked and each
            # entry reaches disk even if a later item fails.
            with open(feature_list_path, 'a') as f:
                f.writelines([name[0], ',', label, '\n'])

            count = count + 1
            if count % 100 == 0:
                current = time.time()
                print('Count {:2},|' 'running time:{:.2f} sec'.format(count, current - start))
def run(dataloaders, num_classes=42):
    """Extract I3D features for the 'train' and 'val' loaders and save them.

    For every item a file ``<feature_path>/i3d_040.npy`` is written; items
    whose file already exists are skipped.

    Args:
        dataloaders: Mapping with ``'train'`` and ``'val'`` loaders yielding
            ``(inputs, labels, feature_path, nf)``.
        num_classes: Number of outputs of the replaced logits layer.
    """
    i3d = InceptionI3d(400, in_channels=3)
    i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    i3d.replace_logits(num_classes)
    i3d.load_state_dict(torch.load(LOAD_MODEL_LOC))
    i3d.cuda()
    i3d.train(False)

    count = 0
    for phase in ('train', 'val'):
        i3d.train(False)  # Set model to evaluate mode

        for inputs, labels, feature_path, nf in tqdm(dataloaders[phase]):
            count += 1
            target_dir = feature_path[0]
            if os.path.exists(target_dir + '/i3d_040' + '.npy'):
                continue
            os.makedirs(target_dir, exist_ok=True)

            b, c, t, h, w = inputs.shape
            print_log(f'shape:{b},{c},{t},{h},{w}')
            print_log('path:' + target_dir)
            print_log(f'count:{count}')
            print_log(f'num_frames:{nf}')

            target_file = os.path.join(target_dir, 'i3d_040')
            time_a = time.time()
            if nf > 1000:
                # Long item: process consecutive 1000-frame windows and concatenate.
                features = []
                for start in range(0, nf, 1000):
                    end = min(start + 1000, nf - 1)
                    with torch.no_grad():
                        ip = Variable(
                            torch.from_numpy(inputs.numpy()[:, :, start:end]).cuda())
                        window_features = i3d.extract_features(ip)
                        features.append(
                            window_features.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
                np.save(target_file, np.concatenate(features, axis=0))
            else:
                with torch.no_grad():
                    inputs = Variable(inputs.cuda())
                    features = i3d.extract_features(inputs)
                np.save(target_file,
                        features.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
            time_b = time.time()
            print_log(f'time consumed:{time_b-time_a}s')
def load_temporal_model(model_name, model_depth):
    """Load a pretrained temporal network in evaluation mode.

    Args:
        model_name: ``'i3d'``, ``'resnet'`` or ``'resnext'``.
        model_depth: Depth used to form the arch name ``'<model_name>-<depth>'``
            for the resnet / resnext families; not used for ``'i3d'``.

    Returns:
        For ``'i3d'`` the ``InceptionI3d`` network; for the 3D ResNet family an
        ``nn.Sequential`` of the network's children without the last two.

    Raises:
        ValueError: If ``model_name`` or the resulting arch is unsupported
            (previously this surfaced as an unbound local).
    """
    verbose = False

    if model_name == 'i3d':
        model_path = '/home/vsharma/Documents/Audio_Visual_Text/models/i3d/rgb_imagenet.pt'
        model = InceptionI3d(400, in_channels=3)
        model.load_state_dict(torch.load(model_path))
    elif model_name in ('resnet', 'resnext'):
        arch = '{}-{}'.format(model_name, model_depth)
        model_path = '/home/vsharma/Documents/Audio_Visual_Text/models/resnet3d'
        model_path = '{}/{}-kinetics.pth'.format(model_path, arch)

        if arch == 'resnet-50':
            model = resnet.resnet50(num_classes=400, shortcut_type='B',
                                    sample_size=112, sample_duration=16,
                                    last_fc=True)
        elif arch == 'resnext-101':
            model = resnext.resnet101(num_classes=400, shortcut_type='B', cardinality=32,
                                      sample_size=112, sample_duration=16,
                                      last_fc=True)
        else:
            raise ValueError("unsupported architecture: '{}'".format(arch))

        model_data = torch.load(model_path)
        assert arch == model_data['arch']

        # Strip the `module.` prefix only where it is actually present, instead
        # of blindly dropping the first seven characters of every key.
        prefix = 'module.'
        new_state_dict = OrderedDict()
        for k, v in model_data['state_dict'].items():
            name = k[len(prefix):] if k.startswith(prefix) else k
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)

        # Removing the last 2 layers: fc and softmax
        # NOTE(review): in the collapsed source it is not certain whether this
        # truncation was scoped to the resnet/resnext branch; it is placed here
        # because its comment names fc/softmax. Confirm.
        model = nn.Sequential(*list(model.children())[:-2])
    else:
        raise ValueError("unknown model name: '{}'".format(model_name))

    model.eval()  # Set model to evaluate mode
    if verbose:
        print(model)
    return model
def load_model(input_channels, learning_rate, scheduler_list, checkpoint=None):
    """Create an I3D network with its SGD optimizer and MultiStepLR scheduler.

    Args:
        input_channels: ``in_channels`` of the network.
        learning_rate: Learning rate of the SGD optimizer.
        scheduler_list: Milestones passed to ``MultiStepLR``.
        checkpoint: Optional path; when given, the ``'model_state'``,
            ``'optimizer_state'`` and ``'scheduler_state'`` entries are restored.

    Returns:
        ``(i3d, optimizer, scheduler)``.
    """
    i3d = InceptionI3d(400, in_channels=input_channels)
    optimizer = optim.SGD(i3d.parameters(),
                          lr=learning_rate,
                          momentum=0.9,
                          weight_decay=1e-7)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, scheduler_list)

    if checkpoint is None:
        return i3d, optimizer, scheduler

    saved = torch.load(checkpoint)
    # Restore in the same order as before: model, optimizer, scheduler.
    restore_plan = (
        (i3d, 'model_state'),
        (optimizer, 'optimizer_state'),
        (scheduler, 'scheduler_state'),
    )
    for target, key in restore_plan:
        target.load_state_dict(saved[key])
    return i3d, optimizer, scheduler
def load_model(learning_rate, scheduler_list, checkpoint=None):
    """Create a FusionNet around a single-logit I3D, plus optimizer and scheduler.

    Args:
        learning_rate: Learning rate of the SGD optimizer.
        scheduler_list: Milestones passed to ``MultiStepLR``.
        checkpoint: Optional path; when given, the ``'model_state'``,
            ``'optimizer_state'`` and ``'scheduler_state'`` entries are restored.

    Returns:
        ``(fusedNet, optimizer, scheduler)``.
    """
    # The backbone's logits layer is replaced with a single output.
    backbone = InceptionI3d(400, in_channels=3)
    backbone.replace_logits(1)
    fusedNet = FusionNet(backbone)

    optimizer = optim.SGD(fusedNet.parameters(),
                          lr=learning_rate,
                          momentum=0.9,
                          weight_decay=1e-7)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, scheduler_list)

    if checkpoint is None:
        return fusedNet, optimizer, scheduler

    saved = torch.load(checkpoint)
    # Restore in the same order as before: model, optimizer, scheduler.
    restore_plan = (
        (fusedNet, 'model_state'),
        (optimizer, 'optimizer_state'),
        (scheduler, 'scheduler_state'),
    )
    for target, key in restore_plan:
        target.load_state_dict(saved[key])
    return fusedNet, optimizer, scheduler
def __init__(self, num_outputs=120):
    """Build the pretrained I3D backbone and the siamese projection head.

    Args:
        num_outputs: Output width of the final linear layer.
    """
    super(ProxyNetwork, self).__init__()

    # Backbone initialised from the pretrained RGB checkpoint.
    backbone = InceptionI3d(400, in_channels=3)
    backbone.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    self.i3d = backbone

    # 1x1x1 projection to 128 channels, without activation or batch norm.
    siam_in_channels = 112 + 288 + 64 + 64
    self.siam_logits = Unit3D(in_channels=siam_in_channels,
                              output_channels=128,
                              kernel_shape=[1, 1, 1],
                              padding=0,
                              activation_fn=None,
                              use_batch_norm=False,
                              use_bias=True,
                              name='siam_logits')
    self.siam_avg_pool = nn.AvgPool3d(kernel_size=[2, 14, 14], stride=(1, 1, 1))

    # Two-layer fully connected head: 128 -> 512 -> num_outputs.
    self.fc1 = nn.Linear(128, 512)
    self.fc2 = nn.Linear(512, num_outputs)
def run(init_lr=0.1, max_step=64e3, mode='rgb', root='/ssd/Charades_v1_rgb',
        train_split='charades/charades.json', batch_size=8*5, save_model=''):
    """Fine-tune I3D with 157 outputs, alternating training and validation phases.

    Gradients are accumulated over 4 batches per optimizer step. Every 10
    steps the running losses are printed and a checkpoint is written to
    ``save_model + <zero-padded step> + '.pt'``.

    Args:
        init_lr: Learning rate of the SGD optimizer.
        max_step: The outer loop repeats while the number of optimizer steps
            is below this value (checked once per pass over both phases).
        mode: ``'flow'`` builds the 2-channel network, anything else the
            3-channel one.
        root: Passed through to ``Dataset``.
        train_split: Passed through to ``Dataset``.
        batch_size: Batch size of both data loaders.
        save_model: Prefix of the checkpoint file names.
    """
    # setup dataset
    # NOTE(review): `RandomHorisontalFlip` is spelled as in the original; confirm
    # that videotransforms really exposes it under this name.
    train_transforms = transforms.Compose([videotransforms.RandomCrop(224),
                                           videotransforms.RandomHorisontalFlip(),
                                           ])
    test_transforms = transforms.Compose([videotransforms.RandomCrop(224)])

    dataset = Dataset(train_split, 'training', root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                             num_workers=36, pin_memory=True)

    val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                                                 num_workers=36, pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    i3d.replace_logits(157)
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    optimizer = optim.SGD(i3d.parameters(), lr=init_lr, momentum=0.9, weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accum gradient
    steps = 0
    # train it
    # The parameter is named `max_step`; the original body read an undefined `max_steps`.
    while steps < max_step:
        print('Step {}/{}'.format(steps, max_step))
        print('-' * 10)

        # Each pass has a training and validation phase
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            i3d.train(is_training)

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                inputs, labels = data

                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                per_frame_logits = i3d(inputs)
                # upsample to input size
                per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')

                # compute localization loss
                loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
                # .item() replaces `.data[0]`, which fails on 0-dim tensors in current PyTorch.
                tot_loc_loss += loc_loss.item()

                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0],
                                                              torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.item()

                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.item()

                # Validation gradients were never consumed (every phase starts with
                # zero_grad), so the backward pass is only run while training.
                if is_training:
                    loss.backward()

                if num_iter == num_steps_per_update and is_training:
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 10 == 0:
                        print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                            phase,
                            tot_loc_loss / (10 * num_steps_per_update),
                            tot_cls_loss / (10 * num_steps_per_update),
                            tot_loss / 10))
                        # save model
                        torch.save(i3d.module.state_dict(), save_model + str(steps).zfill(6) + '.pt')
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.

            if phase == 'val':
                print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                    phase,
                    tot_loc_loss / num_iter,
                    tot_cls_loss / num_iter,
                    (tot_loss * num_steps_per_update) / num_iter))
def drive_run(init_lr=0.1, max_steps=64e3, train_loss=None, val_loss=None, mode='gray',
              root="data/drive_and_act_dataset/simmons_kinect_ir_train/", batch_size=1,
              save_model=''):
    """Continue training I3D on the DriveAndAct data, recording loss histories.

    Gradients are accumulated over 4 batches per optimizer step. Every 10
    steps the running losses are printed, the total loss is appended to
    ``train_loss`` and a checkpoint is written to
    ``save_model + <zero-padded step> + '.pt'``. After each validation phase
    the mean total loss is appended to ``val_loss``.

    Args:
        init_lr: Learning rate of the SGD optimizer.
        max_steps: The outer loop repeats while the number of optimizer steps
            is below this value (checked once per pass over both phases).
        train_loss: Optional list that receives training losses; a new list
            is created when omitted.
        val_loss: Optional list that receives validation losses; a new list
            is created when omitted.
        mode: Only ``'gray'`` is supported.
        root: Directory holding the sample list files; also passed to the dataset.
        batch_size: Batch size of both data loaders.
        save_model: Prefix of the checkpoint file names.

    Returns:
        ``(train_loss, val_loss)``, the same list objects that were passed in
        or the newly created ones.

    Raises:
        ValueError: If ``mode`` is not ``'gray'`` (previously this surfaced
            as an unbound local).
    """
    # Fresh lists per call: list defaults in the signature would be shared
    # between calls and silently accumulate.
    if train_loss is None:
        train_loss = []
    if val_loss is None:
        val_loss = []

    if mode != 'gray':
        raise ValueError("unsupported mode: '{}' (only 'gray' is implemented)".format(mode))

    # `root` is honoured now; its default equals the path that used to be hard-coded.
    root_path = root
    train_list = "sample_train_list.txt"
    test_list = "sample_test_list.txt"

    # create a dataset from our DriveAndAct dataset:
    train_dataset = DriveAndAct(root_path + train_list, mode=mode, root=root_path, transforms=None)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                                   shuffle=True, num_workers=1, pin_memory=True)

    val_dataset = DriveAndAct(root_path + test_list, mode=mode, root=root_path, transforms=None)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size,
                                                 shuffle=True, num_workers=1, pin_memory=True)

    dataloaders = {'train': train_dataloader, 'val': val_dataloader}

    num_classes = 39

    # setup the model: resume from the hard-coded checkpoint
    i3d = InceptionI3d(num_classes, in_channels=3)
    i3d.load_state_dict(torch.load('002370.pt'))
    i3d.replace_logits(num_classes)
    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    optimizer = optim.SGD(i3d.parameters(), lr=init_lr, momentum=0.9, weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 4  # accumulating gradients ("virtual" batch size)
    steps = 0
    # train it
    while steps < max_steps:
        print('Step {}/{}'.format(steps, max_steps))
        print('-' * 10)

        # Each pass has a training and validation phase
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            i3d.train(is_training)

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                inputs, labels = data

                inputs = Variable(inputs.cuda())
                t = inputs.size(2)
                labels = Variable(labels.cuda())

                per_frame_logits = i3d(inputs)
                # upsample to input size
                per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')

                # compute localization loss
                loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
                tot_loc_loss += loc_loss.item()

                # compute classification loss (with max-pooling along time B x C x T)
                cls_loss = F.binary_cross_entropy_with_logits(torch.max(per_frame_logits, dim=2)[0],
                                                              torch.max(labels, dim=2)[0])
                tot_cls_loss += cls_loss.item()

                loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update
                tot_loss += loss.item()

                # Validation gradients were never consumed (every phase starts with
                # zero_grad), so the backward pass is only run while training.
                if is_training:
                    loss.backward()

                if num_iter == num_steps_per_update and is_training:
                    steps += 1
                    num_iter = 0
                    # weights don't update until optimizer.step() called
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    # make sure model is being saved in case we get kicked off datahub
                    if steps % 10 == 0:
                        print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                            phase,
                            tot_loc_loss / (10 * num_steps_per_update),
                            tot_cls_loss / (10 * num_steps_per_update),
                            tot_loss / 10))
                        # save model
                        train_loss.append(tot_loss / 10)
                        torch.save(i3d.module.state_dict(), save_model + str(steps).zfill(6) + '.pt')
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.

            if phase == 'val':
                mean_val_loss = (tot_loss * num_steps_per_update) / num_iter
                print('{} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'.format(
                    phase,
                    tot_loc_loss / num_iter,
                    tot_cls_loss / num_iter,
                    mean_val_loss))
                val_loss.append(mean_val_loss)

    return train_loss, val_loss
def test(mode='rgb', batch_size=1, accuracy_per_frame=True, max_batches=300):
    """Evaluate the saved I3D checkpoint on the DriveAndAct validation list.

    Uses the module-level ``test_list_path``, ``npy_path``, ``CHANNELS`` and
    ``model_path``.

    Args:
        mode: Passed through to ``DriveAndAct``.
        batch_size: Batch size of the data loader.
        accuracy_per_frame: When true, every frame counts as one prediction.
            Otherwise each batch counts once, using the most frequent
            predicted class against the first label index.
        max_batches: Stop after this many batches; ``None`` evaluates
            everything. Defaults to the previously hard-coded 300.

    Returns:
        ``(correct, total, y_pred, y_true)``; the two lists are only filled
        in per-frame mode.
    """
    correct = 0
    total = 0

    # setting up dataloaders
    test_transforms = None
    val_dataset = DriveAndAct(test_list_path, mode=mode, root=npy_path, transforms=test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                                                 num_workers=1, pin_memory=True)

    drive_num_classes = 39

    # set up model
    i3d = InceptionI3d(drive_num_classes, in_channels=CHANNELS)
    i3d.load_state_dict(torch.load(model_path))
    i3d.replace_logits(drive_num_classes)
    i3d.cuda()
    i3d = nn.DataParallel(i3d)
    # torch.no_grad() only disables autograd; eval() is what actually switches
    # the network to evaluation mode, which the original never did.
    i3d.eval()

    y_pred = []
    y_true = []

    with torch.no_grad():
        # Iterate over data.
        for batch_index, data in enumerate(tqdm(val_dataloader)):
            if max_batches is not None and batch_index >= max_batches:
                break

            # get the inputs
            inputs, labels = data
            inputs = Variable(inputs.cuda())
            t = inputs.size(2)
            labels = Variable(labels.cuda())

            per_frame_logits = i3d(inputs)
            # upsample to input size
            per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')

            # compute accuracy: arg-max over dimension 1 of both tensors
            _, pred_indices = per_frame_logits.max(1)
            _, actual_indices = labels.max(1)
            pred_indices = pred_indices.squeeze()
            actual_indices = actual_indices.squeeze()

            if accuracy_per_frame:
                # calculates the accuracy for each frame in the npy
                matches = torch.eq(pred_indices, actual_indices)
                # Flatten so batch sizes above 1 contribute scalars, not rows.
                y_pred.extend(pred_indices.cpu().numpy().reshape(-1))
                y_true.extend(actual_indices.cpu().numpy().reshape(-1))
                correct += int(matches.sum())
                # numel() counts every compared frame; shape[0] undercounted
                # whenever the batch dimension survived the squeeze.
                total += int(matches.numel())
            else:
                label = actual_indices[0]
                # get prediction counts across all the frames in the segment
                pred_classes, pred_counts = torch.unique(pred_indices, return_counts=True)
                # prediction is the action that was inferred the most times across the frames
                _, idx = pred_counts.max(0)
                pred = pred_classes[idx]
                if pred == label:
                    correct += 1
                total += 1

    return correct, total, y_pred, y_true
def main(model_name, mode, root, val_split, ckpt, batch_per_gpu):
    """Set up (optionally distributed) evaluation: process group, loader and model.

    Args:
        model_name: ``'i3d'``, ``'r3d_18'``, ``'mc3_18'``, ``'r2plus1d_18'``
            or ``'c3d'``.
        mode: ``'flow'`` selects 2 input channels for I3D, anything else 3;
            also passed to ``make_dataloader``.
        root: Passed through to ``make_dataloader``.
        val_split: Passed through to ``make_dataloader``.
        ckpt: Not used in the visible part of this function.
        batch_per_gpu: Passed through to ``make_dataloader``.

    Raises:
        NameError: If ``model_name`` is not one of the supported names.
    """
    world_size = MPI.COMM_WORLD.Get_size()
    distributed = world_size > 1

    local_rank = MPI.COMM_WORLD.Get_rank() % torch.cuda.device_count()

    if distributed:
        torch.cuda.set_device(local_rank)
        host = os.environ["MASTER_ADDR"] if "MASTER_ADDR" in os.environ else "127.0.0.1"
        torch.distributed.init_process_group(
            backend="nccl",
            init_method='tcp://{}:12345'.format(host),
            rank=MPI.COMM_WORLD.Get_rank(),
            world_size=MPI.COMM_WORLD.Get_size()
        )
        synchronize()

    val_dataloader = make_dataloader(root,
                                     val_split,
                                     mode,
                                     model_name,
                                     seq_len=16,
                                     overlap=8,
                                     phase='val',
                                     max_iters=None,
                                     batch_per_gpu=batch_per_gpu,
                                     num_workers=16,
                                     shuffle=False,
                                     distributed=distributed,
                                     with_normal=False)

    if model_name == 'i3d':
        # Flow input has 2 channels, everything else 3.
        if mode == 'flow':
            model = InceptionI3d(val_dataloader.dataset.num_classes, in_channels=2,
                                 dropout_keep_prob=0.5)
        else:
            model = InceptionI3d(val_dataloader.dataset.num_classes, in_channels=3,
                                 dropout_keep_prob=0.5)
        model.replace_logits(val_dataloader.dataset.num_classes)
    elif model_name == 'r3d_18':
        model = r3d_18(pretrained=False, num_classes=val_dataloader.dataset.num_classes)
    elif model_name == 'mc3_18':
        model = mc3_18(pretrained=False, num_classes=val_dataloader.dataset.num_classes)
    elif model_name == 'r2plus1d_18':
        model = r2plus1d_18(pretrained=False, num_classes=val_dataloader.dataset.num_classes)
    elif model_name == 'c3d':
        model = C3D(pretrained=False, num_classes=val_dataloader.dataset.num_classes)
    else:
        raise NameError('unknown model name:{}'.format(model_name))

    device = torch.device('cuda')
    model.to(device)

    if distributed:
        # Synchronised batch norm, then the distributed wrapper.
        model = apex.parallel.convert_syncbn_model(model)
        model = DDP(model.cuda(), delay_allreduce=True)
flowDir = rootDir + "flows/"

# Annotation files for the training and validation splits.
ava_training_set = "ava_dataset_files/ava_train_truppr_v2class.csv"
ava_validation_set = "ava_dataset_files/ava_valid_truppr_v2class.csv"

train_data = ava_dataset(ava_training_set, videoDir, flowDir, jsonDir)
valid_data = ava_dataset(ava_validation_set, videoDir, flowDir, jsonDir)

numClasses = 2


def _build_charades_stream(in_channels, checkpoint_path, num_outputs):
    """Load a 157-output I3D checkpoint, replace its logits and wrap it for GPU 0."""
    stream = InceptionI3d(157, in_channels=in_channels)  # 400 when only loaded with imagenet weights
    stream.load_state_dict(torch.load(checkpoint_path))
    stream.replace_logits(num_outputs)
    stream.cuda(0)
    return nn.DataParallel(stream)


########## Activity Recognition - RGB Stream
i3d_RGB = _build_charades_stream(3, 'models/rgb_charades.pt', numClasses)

########## Activity Recognition - Optical Flow Stream
i3d_OF = _build_charades_stream(2, 'models/flow_charades.pt', numClasses)

# Both streams are put into training mode.
i3d_RGB.train(True)
i3d_OF.train(True)
# Record the hyper-parameters of this run next to its outputs.
_run_summary = 'LR = {}\nBATCH_SIZE = {}\nEPOCHS = {}\n'.format(LR, BATCH_SIZE, EPOCHS)
with open(SAVE_DIR + 'info.txt', 'w+') as f:
    f.write(_run_summary)

# Transforms: random crop + flip for training, centre crop for testing.
train_transforms = transforms.Compose([
    videotransforms.RandomCrop(224),
    videotransforms.RandomHorizontalFlip(),
])
test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

# Datasets and Dataloaders
train_dataset = Dataset(train_split, 'training', root, mode, train_transforms)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=36,
    pin_memory=True,
)
val_dataset = Dataset(train_split, 'testing', root, mode, test_transforms)
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=36,
    pin_memory=True,
)
dataloaders = {'train': train_dataloader, 'val': val_dataloader}

# Load pre-trained I3D model (400 output classes), then swap the final layer
# so it fits the new dataset.
i3d = InceptionI3d(400, in_channels=3)
i3d.load_state_dict(torch.load('/vision/u/rhsieh91/pytorch-i3d/models/rgb_imagenet.pt'))
i3d.replace_logits(NUM_CLASSES)

# Optimizer and schedule: learning rate decays by gamma at epochs 10 and 20.
optimizer = optim.Adam(i3d.parameters(), lr=LR)
lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [10, 20], gamma=0.1)

# Start training
train(
    i3d,
    optimizer,
    dataloaders,
    num_classes=NUM_CLASSES,
    epochs=EPOCHS,
    save_dir=SAVE_DIR,
    use_gpu=USE_GPU,
    lr_sched=lr_sched,
)
def run(init_lr=0.01, max_steps=200, mode='rgb',
        root='/media/pritesh/Entertainment/Visual-Tactile_Dataset/dataset/',
        train_split='train.txt', test_split='test.txt', batch_size=1, save_model=''):
    """Evaluate a saved FusionNet checkpoint on the train and test splits.

    The running mean of the classification loss is printed, appended to
    ``inference_V.txt`` and logged to TensorBoard after every batch. No
    parameters are updated.

    Args:
        init_lr: Learning rate used to construct the optimizer whose state is
            restored; also echoed in the progress message.
        max_steps: Not used by this function; kept for interface compatibility.
        mode: Passed through to ``Dataset``.
        root: Passed through to ``Dataset``.
        train_split: Split file for the ``'train'`` phase.
        test_split: Split file for the ``'val'`` phase.
        batch_size: Batch size of both data loaders.
        save_model: Path of the checkpoint holding ``'model_state'``,
            ``'optimizer_state'`` and ``'scheduler_state'``.
    """
    print(train_split, test_split)
    writer = tensorboardX.SummaryWriter()
    # The writer is closed on every exit path so pending events are flushed.
    try:
        # setup dataset
        test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])
        dataset = Dataset(train_split, root, mode, test_transforms)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                                 num_workers=0, pin_memory=True)
        val_dataset = Dataset(test_split, root, mode, test_transforms)
        val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size,
                                                     shuffle=True, num_workers=0, pin_memory=True)
        dataloaders = {'train': dataloader, 'val': val_dataloader}

        use_cuda = torch.cuda.is_available()

        # setup the model
        sm = InceptionI3d(400, in_channels=3)
        sm.replace_logits(1)
        fusedNet = FusionNet(sm)
        if use_cuda:
            fusedNet.cuda()
            fusedNet = nn.DataParallel(fusedNet)

        optimizer = optim.SGD(fusedNet.parameters(), lr=init_lr, momentum=0.9,
                              weight_decay=0.0000001)
        lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [50, 100, 150, 200])

        if use_cuda:
            data = torch.load(save_model)
        else:
            # Remap GPU storages onto the CPU when no GPU is present.
            data = torch.load(save_model, map_location=lambda storage, loc: storage)
        fusedNet.load_state_dict(data['model_state'])
        optimizer.load_state_dict(data['optimizer_state'])
        lr_sched.load_state_dict(data['scheduler_state'])

        with open('inference_V.txt', 'w') as log_file:
            log_file.write("train and validation loss file\n")

        fusedNet.train(False)  # Set model to evaluate mode

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            for phase in ['train', 'val']:
                print('phase : {}'.format(phase))
                tot_cls_loss = 0.0
                num_iter = 0
                with open('inference_V.txt', 'a') as log_file:
                    log_file.write("---------------\n")

                # Iterate over data.
                for batch in dataloaders[phase]:
                    num_iter += 1
                    # get the inputs; only the first video stream and the labels are used
                    f_vid, _l_vid, _tactile, _pos, labels = batch
                    if use_cuda:
                        rgb_inputs = Variable(f_vid.cuda())
                        labels = Variable(labels.cuda())
                    else:
                        rgb_inputs = Variable(f_vid)
                        labels = Variable(labels)

                    out = fusedNet(rgb_inputs.float())
                    out = out.squeeze(1)

                    # compute classification loss
                    cls_loss = F.binary_cross_entropy_with_logits(out.double(), labels.double())
                    tot_cls_loss += cls_loss.item()

                    running_loss = tot_cls_loss / num_iter
                    print('{} Loss: {:.4f} and lr: {}'.format(phase, running_loss, init_lr))
                    with open('inference_V.txt', 'a') as log_file:
                        log_file.write("%f\n" % running_loss)

                    # NOTE(review): assumed to be logged once per batch (num_iter is the
                    # step value); the collapsed source does not show the nesting.
                    # Both original branches made this identical call.
                    writer.add_scalar('inference_error/' + phase, running_loss, num_iter)
    finally:
        writer.close()
def run(mode='rgb', load_model='', sample_mode='oversample', frequency=16,
        input_dir='', output_dir='', batch_size=4, usezip=False):
    """Extract I3D features for every video under ``input_dir/<class>/<video>``.

    Each video is cut into 16-frame chunks whose start frames are ``frequency``
    frames apart. The chunks are pushed through the network in batches and the
    result is written to ``output_dir/<class>/<video>/<video>-<mode>.npz``.
    Videos whose output file already exists are skipped.

    Args:
        mode: ``'rgb'`` or ``'flow'``.
        load_model: Directory containing ``rgb_imagenet.pt`` / ``flow_imagenet.pt``.
        sample_mode: ``'oversample'`` (ten crops), ``'center_crop'`` or ``'resize'``.
        frequency: Distance in frames between the starts of consecutive chunks.
        input_dir: Root directory of the extracted frames.
        output_dir: Root directory for the feature files (created if missing).
        batch_size: Maximum number of chunks per forward pass.
        usezip: Read frames from ``rgb.zip`` / ``flow_x.zip`` / ``flow_y.zip``
            inside each video directory instead of from loose files.

    Raises:
        AssertionError: on an unknown ``mode`` / ``sample_mode``, on mismatching
            flow file counts, or when a video has no more than 16 frames.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    chunk_size = 16

    assert(mode in ['rgb', 'flow'])
    assert(sample_mode in ['oversample', 'center_crop', 'resize'])

    # setup the model
    if mode == 'flow':
        load_model = os.path.join(load_model, 'flow_imagenet.pt')
        i3d = InceptionI3d(400, in_channels=2)  # 400 classes representing Kinetics dataset
    else:
        load_model = os.path.join(load_model, 'rgb_imagenet.pt')
        i3d = InceptionI3d(400, in_channels=3)
    #i3d.replace_logits(157)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()
    i3d.train(False)  # Set model to evaluate mode

    def forward_batch(b_data):
        """Run one channels-last numpy batch through the network and return numpy features."""
        with torch.no_grad():
            b_data = b_data.transpose([0, 4, 1, 2, 3])
            b_data = torch.from_numpy(b_data)  # b,c,t,h,w  # ?x3x16x224x224 (for RGB)
            b_data = b_data.cuda().float()
            b_features = i3d.extract_features(b_data)
            b_features = b_features.data.cpu().numpy()[:, :, 0, 0, 0]
        return b_features

    # Collect "<class>/<video>" names and mirror the class directories.
    video_names_list = []
    for class_name in os.listdir(input_dir):
        class_out_dir = os.path.join(output_dir, class_name).replace('\\', '/')
        if not os.path.exists(class_out_dir):
            os.makedirs(class_out_dir)
        class_in_dir = os.path.join(input_dir, class_name).replace('\\', '/')
        for vid_name in os.listdir(class_in_dir):
            video_names_list.append(os.path.join(class_name, vid_name).replace('\\', '/'))

    for video_name in video_names_list:
        v_name = video_name.split('/')[1]  # Only retrieve name of every .mp4 video
        save_file = '{}-{}.npz'.format(v_name, mode)
        video_out_dir = os.path.join(output_dir, video_name)
        if not os.path.exists(video_out_dir.replace('\\', '/')):
            os.makedirs(video_out_dir.replace('\\', '/'))
        if os.path.exists(os.path.join(video_out_dir, save_file)):
            continue  # features for this video were already extracted

        frames_dir = os.path.join(input_dir, video_name)
        open_zips = []  # archives opened for this video; closed in `finally`
        try:
            if mode == 'rgb':
                if usezip:
                    rgb_zipdata = zipfile.ZipFile(os.path.join(frames_dir, 'rgb.zip'), 'r')
                    open_zips.append(rgb_zipdata)
                    rgb_files = [i for i in rgb_zipdata.namelist() if i.startswith('rgb')]
                else:
                    frames_dir = os.path.join(frames_dir, mode)
                    rgb_files = [i for i in os.listdir(frames_dir) if i.startswith('rgb')]
                rgb_files.sort()
                frame_cnt = len(rgb_files)
            else:
                if usezip:
                    flow_x_zipdata = zipfile.ZipFile(os.path.join(frames_dir, 'flow_x.zip'), 'r')
                    open_zips.append(flow_x_zipdata)
                    flow_x_files = [i for i in flow_x_zipdata.namelist() if i.startswith('flow_x')]
                    flow_y_zipdata = zipfile.ZipFile(os.path.join(frames_dir, 'flow_y.zip'), 'r')
                    open_zips.append(flow_y_zipdata)
                    flow_y_files = [i for i in flow_y_zipdata.namelist() if i.startswith('flow_y')]
                else:
                    flowx_dir = os.path.join(frames_dir, 'flow_x')
                    flow_x_files = [i for i in os.listdir(flowx_dir) if i.startswith('flow_x')]
                    flowy_dir = os.path.join(frames_dir, 'flow_y')
                    flow_y_files = [i for i in os.listdir(flowy_dir) if i.startswith('flow_y')]
                flow_x_files.sort()
                flow_y_files.sort()
                assert(len(flow_y_files) == len(flow_x_files))
                frame_cnt = len(flow_y_files)

            # Cut frames
            assert(frame_cnt > chunk_size)
            clipped_length = frame_cnt - chunk_size
            clipped_length = (clipped_length // frequency) * frequency  # The start of last chunk

            # Frames to chunks: one row of `chunk_size` consecutive indices per chunk.
            frame_indices = np.array([
                list(range(start, start + chunk_size))
                for start in range(0, clipped_length + 1, frequency)
            ])
            chunk_num = frame_indices.shape[0]
            batch_num = int(np.ceil(chunk_num / batch_size))  # Chunks to batches
            frame_indices = np.array_split(frame_indices, batch_num, axis=0)

            # One feature list per crop (ten crops when oversampling).
            if sample_mode == 'oversample':
                full_features = [[] for _ in range(10)]
            else:
                full_features = [[]]

            require_resize = sample_mode == 'resize'
            for batch_id in range(batch_num):
                if mode == 'rgb':
                    if usezip:
                        batch_data = load_ziprgb_batch(rgb_zipdata, rgb_files,
                                                       frame_indices[batch_id], require_resize)
                    else:
                        batch_data = load_rgb_batch(frames_dir, rgb_files,
                                                    frame_indices[batch_id], require_resize)
                else:
                    if usezip:
                        batch_data = load_zipflow_batch(
                            flow_x_zipdata, flow_y_zipdata,
                            flow_x_files, flow_y_files,
                            frame_indices[batch_id], require_resize)
                    else:
                        batch_data = load_flow_batch(frames_dir, flow_x_files, flow_y_files,
                                                     frame_indices[batch_id], require_resize)

                if sample_mode == 'oversample':
                    batch_data_ten_crop = oversample_data(batch_data)
                    for i in range(10):
                        assert(batch_data_ten_crop[i].shape[-2] == 224)
                        assert(batch_data_ten_crop[i].shape[-3] == 224)
                        full_features[i].append(forward_batch(batch_data_ten_crop[i]))
                else:
                    if sample_mode == 'center_crop':
                        batch_data = batch_data[:, :, 16:240, 58:282, :]  # Center Crop (4, 16, 224, 224, 2)
                    assert(batch_data.shape[-2] == 224)
                    assert(batch_data.shape[-3] == 224)
                    full_features[0].append(forward_batch(batch_data))

            # Stack to (num_crops, num_chunks, feature_dim).
            full_features = [np.concatenate(i, axis=0) for i in full_features]
            full_features = [np.expand_dims(i, axis=0) for i in full_features]
            full_features = np.concatenate(full_features, axis=0)

            np.savez(os.path.join(video_out_dir, save_file),
                     feature=full_features,
                     frame_cnt=frame_cnt,
                     video_name=v_name)
        finally:
            # Release the per-video archives so long runs do not leak handles.
            for archive in open_zips:
                archive.close()

        print('{} Extracted features {}: {} / {}, {}'.format(
            v_name, mode, frame_cnt, clipped_length, full_features.shape))
def run(init_lr=0.01, root='', split_file='data/annotations/charades.json', batch_size=8,
        save_dir='', stride=4, num_span_frames=32, num_epochs=200):
    """Fine-tune I3D with 157 output classes and log loss / mAP to TensorBoard.

    When the module-level ``args.checkpoint_path`` is set, the model weights,
    step counter and epoch are resumed from that checkpoint; otherwise training
    starts from ``models/rgb_imagenet.pt``. A checkpoint is written through
    ``save_checkpoint`` after every training phase.

    Args:
        init_lr: Learning rate of the Adam optimizer.
        root: Dataset root directory, forwarded to ``Dataset``.
        split_file: Annotation file, forwarded to ``Dataset``.
        batch_size: Batch size of both data loaders.
        save_dir: Directory handed to ``save_checkpoint``.
        stride: Forwarded to ``Dataset``.
        num_span_frames: Forwarded to ``Dataset``.
        num_epochs: Epoch index at which training stops.
    """
    writer = SummaryWriter()  # tensorboard logging

    # setup dataset
    train_transforms = transforms.Compose(
        [transforms.Resize((224, 224)), transforms.ToTensor()])
    test_transforms = transforms.Compose(
        [transforms.Resize((224, 224)), transforms.ToTensor()])

    print('Getting train dataset...')
    train_dataset = Dataset(split_file, 'training', root, train_transforms,
                            stride, num_span_frames, is_sife=False)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                                   shuffle=True, num_workers=0, pin_memory=True)

    print('Getting validation dataset...')
    val_dataset = Dataset(split_file, 'testing', root, test_transforms,
                          stride, num_span_frames, is_sife=False)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size,
                                                 shuffle=False, num_workers=0, pin_memory=True)
    dataloaders = {'train': train_dataloader, 'val': val_dataloader}

    print('Loading model...')
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setup the model
    i3d = InceptionI3d(400, in_channels=3)
    steps = 0
    start_epoch = 0
    if args.checkpoint_path:
        # Read the checkpoint once; load onto CPU, the model is moved afterwards.
        checkpoint = torch.load(args.checkpoint_path, map_location='cpu')
        i3d.replace_logits(157)
        state_dict = OrderedDict()
        for key, value in checkpoint['model_state_dict'].items():
            # Only strip the DataParallel prefix when it is really there;
            # single-GPU checkpoints carry no 'module.' prefix.
            if key.startswith('module.'):
                key = key[len('module.'):]
            state_dict[key] = value
        i3d.load_state_dict(state_dict)
        steps = checkpoint['steps']
        start_epoch = checkpoint['epoch']
    else:
        i3d.load_state_dict(torch.load('models/rgb_imagenet.pt', map_location='cpu'))
        i3d.replace_logits(157)

    i3d.to(device)
    if torch.cuda.device_count() > 1:
        print('Using {} GPUs'.format(torch.cuda.device_count()))
        i3d = nn.DataParallel(i3d)
    print('Loaded model.')

    optimizer = optim.Adam(i3d.parameters(), lr=init_lr)
    #lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [30], gamma=0.1)

    # TRAIN
    for epoch in range(start_epoch, num_epochs):
        print('-' * 50)
        print('EPOCH {}/{}'.format(epoch, num_epochs))
        print('-' * 50)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            i3d.train(is_train)
            if is_train:
                print('-' * 10, 'TRAINING', '-' * 10)
            else:
                print('-' * 10, 'VALIDATION', '-' * 10)

            # Per-batch arrays, concatenated once after the loop.
            epoch_scores = []
            epoch_labels = []
            print('Entering data loading...')
            for data in dataloaders[phase]:
                # get the inputs
                inputs, labels, vid = data
                t = inputs.shape[2]
                inputs = inputs.to(device)
                labels = labels.to(device)

                if is_train:
                    per_frame_logits = i3d(inputs)
                else:
                    with torch.no_grad():
                        per_frame_logits = i3d(inputs)

                # upsample to input size
                per_frame_logits = F.interpolate(per_frame_logits, t, mode='linear')  # B x Classes x T

                max_frame_logits = torch.max(per_frame_logits, dim=2)[0]  # B x Classes
                labels = torch.max(labels, dim=2)[0]  # B x Classes

                if is_train:
                    loss = F.binary_cross_entropy_with_logits(max_frame_logits, labels)
                    writer.add_scalar('loss/train', loss, steps)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    if steps % 10 == 0:
                        print('Step {} {} loss: {:.4f}'.format(steps, phase, loss))
                    steps += 1

                # Keep the sigmoid scores, not thresholded 0/1 decisions:
                # average precision ranks samples by score.
                epoch_scores.append(torch.sigmoid(max_frame_logits).detach().cpu().numpy())
                epoch_labels.append(labels.detach().cpu().numpy())

            # Eval
            all_scores = np.concatenate(epoch_scores, axis=0)
            all_labels = np.concatenate(epoch_labels, axis=0)
            all_APs = [
                metrics.average_precision_score(y_true=all_labels[:, j], y_score=all_scores[:, j])
                for j in range(157)
            ]
            # nanmean ignores classes whose AP came out as NaN.
            mAP = np.nanmean(all_APs)
            if is_train:
                writer.add_scalar('mAP/train', mAP, epoch)
                print('-' * 50)
                print('{} mAP: {:.4f}'.format(phase, mAP))
                print('-' * 50)
                save_checkpoint(i3d, optimizer, loss, save_dir, epoch, steps)  # save checkpoint after epoch!
            else:
                writer.add_scalar('mAP/val', mAP, epoch)
                print('{} mAP: {:.4f}'.format(phase, mAP))

        #lr_sched.step() # step after epoch

    writer.close()
for i, x in enumerate(X): if int(x[1]) >= 10: new_X.append(x) new_Y.append(Y[i]) new_X = np.array(new_X) new_Y = np.array(new_Y) train_data = VideoDataset(new_X, new_Y) trainloader = DataLoader(train_data, batch_size=batch_size, shuffle=True) load_model = "../input/rgb_charades.pt" save_dir = "../input/i3d" i3d = InceptionI3d(400, in_channels=3) i3d.replace_logits(157) i3d.load_state_dict(torch.load(load_model)) i3d.cuda() i3d.train(False) # Set model to evaluate mode tot_loss = 0.0 tot_loc_loss = 0.0 tot_cls_loss = 0.0 # Iterate over data. for data in trainloader: # get the inputs inputs, labels, name = data if os.path.exists(os.path.join(save_dir, name[0]+'.npy')):
def run(max_steps=64e3, mode='flow', root='./frames', split='gt.json', batch_size=1,
        load_model='', save_dir=''):
    """Extract I3D features for every video of the training and test splits.

    Videos longer than 16 frames are cut into consecutive, non-overlapping
    16-frame windows (a trailing partial window is dropped) and the stacked
    per-window features are saved to ``save_dir/<name>.npy``. Shorter videos
    are passed through the network in one piece.

    Args:
        max_steps: Unused; kept so existing callers keep working.
        mode: ``'flow'`` builds a 2-channel network, anything else a 3-channel one.
        root: Dataset root directory, forwarded to ``Dataset``.
        split: Split file, forwarded to ``Dataset``.
        batch_size: Loader batch size. Files are named after ``name[0]`` only.
        load_model: Path of the state dict to load.
        save_dir: Output directory for the ``.npy`` files.
    """
    window = 16

    # setup dataset
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(split, 'training', root, mode, test_transforms, num=-1, save_dir=save_dir)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                             num_workers=8, pin_memory=True)

    val_dataset = Dataset(split, 'test', root, mode, test_transforms, num=-1, save_dir=save_dir)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                                                 num_workers=8, pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(20, in_channels=2)
    else:
        i3d = InceptionI3d(20, in_channels=3)
    i3d.replace_logits(157)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()
    i3d.train(False)  # Set model to evaluate mode

    for phase in ['train', 'val']:
        # Iterate over data.
        for data in dataloaders[phase]:
            # get the inputs
            inputs, labels, name = data
            # if os.path.exists(os.path.join(save_dir, name[0] + '.npy')):
            #     continue

            b, c, t, h, w = inputs.shape
            out_path = os.path.join(save_dir, name[0])

            # Feature extraction only: no autograd graph is needed.
            with torch.no_grad():
                if t > window:
                    features = []
                    # Visit every start that leaves room for a full window.
                    # The earlier `min(t - 1, ...)` bound discarded the last
                    # full window whenever t was a multiple of 16.
                    for start in range(0, t - window + 1, window):
                        clip = inputs[:, :, start:start + window].cuda()
                        feature = torch.squeeze(i3d.extract_features(clip))
                        features.append(feature.data.cpu().numpy())
                    np.save(out_path, np.asarray(features))
                else:
                    features = i3d.extract_features(inputs.cuda())
                    np.save(out_path,
                            features.squeeze(0).permute(1, 2, 3, 0).data.cpu().numpy())
def run(mode='rgb', load_model='', sample_mode='oversample', frequency=16,
        input_dir='', output_dir='', batch_size=40, usezip=False):
    """Extract I3D features for every ``v*`` video directory in ``input_dir``.

    Each video is cut into 16-frame chunks whose start frames are ``frequency``
    frames apart. The chunks are pushed through the network in batches and the
    result is written to ``output_dir/<video>-<mode>.npz``. Videos whose output
    file already exists are skipped.

    Args:
        mode: ``'rgb'`` or ``'flow'``.
        load_model: Path of the state dict to load.
        sample_mode: ``'oversample'`` (ten crops), ``'center_crop'`` or ``'resize'``.
        frequency: Distance in frames between the starts of consecutive chunks.
        input_dir: Directory holding one sub-directory per video.
        output_dir: Existing directory that receives the ``.npz`` files.
        batch_size: Maximum number of chunks per forward pass.
        usezip: Read frames from ``img.zip`` / ``flow_x.zip`` / ``flow_y.zip``
            inside each video directory instead of from loose files.

    Raises:
        AssertionError: on an unknown ``mode`` / ``sample_mode``, on mismatching
            flow file counts, or when a video has no more than 16 frames.
    """
    chunk_size = 16

    assert (mode in ['rgb', 'flow'])
    assert (sample_mode in ['oversample', 'center_crop', 'resize'])

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
    else:
        i3d = InceptionI3d(400, in_channels=3)
    #i3d.replace_logits(157)
    i3d.load_state_dict(torch.load(load_model))
    i3d.cuda()
    i3d.train(False)  # Set model to evaluate mode

    def forward_batch(b_data):
        """Run one channels-last numpy batch through the network and return numpy features."""
        b_data = b_data.transpose([0, 4, 1, 2, 3])
        b_data = torch.from_numpy(b_data)  # b,c,t,h,w  # 40x3x16x224x224
        # `volatile=True` is ignored by torch >= 0.4; no_grad is its replacement.
        with torch.no_grad():
            b_features = i3d.extract_features(b_data.cuda().float())
        return b_features.data.cpu().numpy()[:, :, 0, 0, 0]

    video_names = [i for i in os.listdir(input_dir) if i[0] == 'v']

    for video_name in video_names:
        save_file = '{}-{}.npz'.format(video_name, mode)
        if os.path.exists(os.path.join(output_dir, save_file)):
            continue  # features for this video were already extracted

        frames_dir = os.path.join(input_dir, video_name)
        open_zips = []  # archives opened for this video; closed in `finally`
        try:
            if mode == 'rgb':
                if usezip:
                    rgb_zipdata = zipfile.ZipFile(os.path.join(frames_dir, 'img.zip'), 'r')
                    open_zips.append(rgb_zipdata)
                    rgb_files = [i for i in rgb_zipdata.namelist() if i.startswith('img')]
                else:
                    rgb_files = [i for i in os.listdir(frames_dir) if i.startswith('img')]
                rgb_files.sort()
                frame_cnt = len(rgb_files)
            else:
                if usezip:
                    flow_x_zipdata = zipfile.ZipFile(os.path.join(frames_dir, 'flow_x.zip'), 'r')
                    open_zips.append(flow_x_zipdata)
                    flow_x_files = [i for i in flow_x_zipdata.namelist() if i.startswith('x_')]
                    flow_y_zipdata = zipfile.ZipFile(os.path.join(frames_dir, 'flow_y.zip'), 'r')
                    open_zips.append(flow_y_zipdata)
                    flow_y_files = [i for i in flow_y_zipdata.namelist() if i.startswith('y_')]
                else:
                    flow_x_files = [i for i in os.listdir(frames_dir) if i.startswith('flow_x')]
                    flow_y_files = [i for i in os.listdir(frames_dir) if i.startswith('flow_y')]
                flow_x_files.sort()
                flow_y_files.sort()
                assert (len(flow_y_files) == len(flow_x_files))
                frame_cnt = len(flow_y_files)

            # Cut frames
            assert (frame_cnt > chunk_size)
            clipped_length = frame_cnt - chunk_size
            clipped_length = (clipped_length // frequency) * frequency  # The start of last chunk

            # Frames to chunks: one row of `chunk_size` consecutive indices per chunk.
            frame_indices = np.array([
                list(range(start, start + chunk_size))
                for start in range(0, clipped_length + 1, frequency)
            ])
            chunk_num = frame_indices.shape[0]
            batch_num = int(np.ceil(chunk_num / batch_size))  # Chunks to batches
            frame_indices = np.array_split(frame_indices, batch_num, axis=0)

            # One feature list per crop (ten crops when oversampling).
            if sample_mode == 'oversample':
                full_features = [[] for _ in range(10)]
            else:
                full_features = [[]]

            require_resize = sample_mode == 'resize'
            for batch_id in range(batch_num):
                if mode == 'rgb':
                    if usezip:
                        batch_data = load_ziprgb_batch(rgb_zipdata, rgb_files,
                                                       frame_indices[batch_id], require_resize)
                    else:
                        batch_data = load_rgb_batch(frames_dir, rgb_files,
                                                    frame_indices[batch_id], require_resize)
                else:
                    if usezip:
                        batch_data = load_zipflow_batch(flow_x_zipdata, flow_y_zipdata,
                                                        flow_x_files, flow_y_files,
                                                        frame_indices[batch_id], require_resize)
                    else:
                        batch_data = load_flow_batch(frames_dir, flow_x_files, flow_y_files,
                                                     frame_indices[batch_id], require_resize)

                if sample_mode == 'oversample':
                    batch_data_ten_crop = oversample_data(batch_data)
                    for i in range(10):
                        # (a leftover debugger breakpoint used to stop here on every crop)
                        assert (batch_data_ten_crop[i].shape[-2] == 224)
                        assert (batch_data_ten_crop[i].shape[-3] == 224)
                        full_features[i].append(forward_batch(batch_data_ten_crop[i]))
                else:
                    if sample_mode == 'center_crop':
                        batch_data = batch_data[:, :, 16:240, 58:282, :]  # Center Crop (39, 16, 224, 224, 2)
                    assert (batch_data.shape[-2] == 224)
                    assert (batch_data.shape[-3] == 224)
                    full_features[0].append(forward_batch(batch_data))

            # Stack to (num_crops, num_chunks, feature_dim).
            full_features = [np.concatenate(i, axis=0) for i in full_features]
            full_features = [np.expand_dims(i, axis=0) for i in full_features]
            full_features = np.concatenate(full_features, axis=0)

            np.savez(os.path.join(output_dir, save_file),
                     feature=full_features,
                     frame_cnt=frame_cnt,
                     video_name=video_name)
        finally:
            # Release the per-video archives so long runs do not leak handles.
            for archive in open_zips:
                archive.close()

        print('{} done: {} / {}, {}'.format(video_name, frame_cnt, clipped_length,
                                            full_features.shape))
def run(configs, mode='rgb', root='/ssd/Charades_v1_rgb', train_split='charades/charades.json',
        save_model='', num_classes=None, weights=None):
    """Train I3D with gradient accumulation and evaluate it after every epoch.

    The loss is the mean of a per-frame ("localization") and a
    max-over-time ("classification") binary cross-entropy term. After each
    test phase the accuracy from the confusion matrix is computed, a
    checkpoint is written when it improved or on every second epoch, and the
    ``ReduceLROnPlateau`` scheduler is stepped on the test loss.

    Args:
        configs: Object providing ``batch_size``, ``init_lr``,
            ``adam_weight_decay``, ``update_per_step`` and ``max_steps``.
        mode: ``'flow'`` or anything else for RGB; selects the pretrained weights.
        root: Dataset root directory, forwarded to ``Dataset``.
        train_split: Split file used for both the train and the test subset.
        save_model: Prefix prepended to every checkpoint file name.
        num_classes: Forwarded to ``Dataset``; the value actually used for the
            model is ``dataset.num_classes``.
        weights: Optional state dict path loaded after replacing the logits layer.
    """
    print(configs)

    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, 'train', root, mode,
                      num_classes=num_classes, transforms=train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=configs.batch_size, shuffle=True,
                                             num_workers=4, pin_memory=True)

    val_dataset = Dataset(train_split, 'test', root, mode,
                          num_classes=num_classes, transforms=test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=configs.batch_size,
                                                 shuffle=True, num_workers=4, pin_memory=False)

    dataloaders = {'train': dataloader, 'test': val_dataloader}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        i3d.load_state_dict(torch.load('weights/flow_imagenet.pt'))
    else:
        i3d = InceptionI3d(400, in_channels=3)
        i3d.load_state_dict(torch.load('weights/rgb_imagenet.pt'))

    num_classes = dataset.num_classes
    i3d.replace_logits(num_classes)

    if weights:
        print('loading weights {}'.format(weights))
        i3d.load_state_dict(torch.load(weights))

    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = configs.init_lr
    weight_decay = configs.adam_weight_decay
    optimizer = optim.Adam(i3d.parameters(), lr=lr, weight_decay=weight_decay)

    num_steps_per_update = configs.update_per_step  # accum gradient
    steps = 0
    epoch = 0
    best_val_score = 0

    # train it
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.3)
    while steps < configs.max_steps and epoch < 400:  # for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, configs.max_steps))
        print('-' * 10)

        epoch += 1
        # Each epoch has a training and validation phase
        for phase in ['train', 'test']:
            is_train = phase == 'train'
            i3d.train(is_train)

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            num_iter = 0
            optimizer.zero_grad()

            # `np.int` was removed from NumPy; the builtin int is the same dtype.
            confusion_matrix = np.zeros((num_classes, num_classes), dtype=int)

            # Iterate over data.
            for data in dataloaders[phase]:
                num_iter += 1
                # get the inputs
                if data == -1:
                    # bracewell does not compile opencv with ffmpeg, strange
                    # errors occur resulting in no video loaded
                    continue

                # inputs, labels, vid, src = data
                inputs, labels, vid = data

                inputs = inputs.cuda()
                t = inputs.size(2)
                labels = labels.cuda()

                # Gradients are only needed while training; the test phase
                # never applies an optimizer step.
                with torch.set_grad_enabled(is_train):
                    per_frame_logits = i3d(inputs, pretrained=False)
                    # upsample to input size
                    per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')

                    # compute localization loss
                    loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
                    tot_loc_loss += loc_loss.data.item()

                    predictions = torch.max(per_frame_logits, dim=2)[0]
                    gt = torch.max(labels, dim=2)[0]

                    # compute classification loss (with max-pooling along time B x C x T)
                    cls_loss = F.binary_cross_entropy_with_logits(predictions, gt)
                    tot_cls_loss += cls_loss.data.item()

                    loss = (0.5 * loc_loss + 0.5 * cls_loss) / num_steps_per_update

                for i in range(per_frame_logits.shape[0]):
                    confusion_matrix[torch.argmax(gt[i]).item(),
                                     torch.argmax(predictions[i]).item()] += 1

                tot_loss += loss.data.item()
                if num_iter == num_steps_per_update // 2:
                    print(epoch, steps, loss.data.item())

                if is_train:
                    loss.backward()

                if num_iter == num_steps_per_update and is_train:
                    steps += 1
                    num_iter = 0
                    optimizer.step()
                    optimizer.zero_grad()
                    # lr_sched.step()
                    if steps % 10 == 0:
                        acc = float(np.trace(confusion_matrix)) / np.sum(confusion_matrix)
                        print(
                            'Epoch {} {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                            .format(epoch, phase,
                                    tot_loc_loss / (10 * num_steps_per_update),
                                    tot_cls_loss / (10 * num_steps_per_update),
                                    tot_loss / 10,
                                    acc))
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.

            if phase == 'test':
                val_score = float(np.trace(confusion_matrix)) / np.sum(confusion_matrix)
                # Save on improvement and, regardless of the score, every second epoch.
                if val_score > best_val_score or epoch % 2 == 0:
                    # A periodic save must not lower the recorded best score.
                    best_val_score = max(best_val_score, val_score)
                    model_name = save_model + "nslt_" + str(num_classes) + "_" + str(steps).zfill(
                        6) + '_%3f.pt' % val_score

                    torch.save(i3d.module.state_dict(), model_name)
                    print(model_name)

                print(
                    'VALIDATION: {} Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f} Accu :{:.4f}'
                    .format(phase,
                            tot_loc_loss / num_iter,
                            tot_cls_loss / num_iter,
                            (tot_loss * num_steps_per_update) / num_iter,
                            val_score))

                scheduler.step(tot_loss * num_steps_per_update / num_iter)
def run(init_lr=0.001, max_steps=20, mode='rgb', root='/proxy/', train_split='./scott.txt',
        test_split="./scottt.txt", batch_size=8 * 5, save_model='nope'):
    """Train the proxy ordering network with a distance-weighted cross-entropy loss.

    The logits of every sample are scaled by the row of ``distance_dict.npy``
    selected by that sample's label before the cross-entropy is taken. The
    whole model is pickled to ``customloss<steps>.pt`` after every optimizer
    step.

    Args:
        init_lr: Learning rate of the SGD optimizer.
        max_steps: Training stops once this many optimizer steps were taken
            (checked between passes over the data).
        mode: ``'flow'`` trains a flow I3D; anything else a ``ProxyNetwork``.
        root: Ignored; see the NOTE below.
        train_split: Split file of the training set.
        test_split: Split file of the validation set.
        batch_size: Batch size of both data loaders.
        save_model: Unused; kept so existing callers keep working.
    """
    # This table contains the distance between two possible ordering sequences
    # It is therefore a 120*120 table
    distance_dict = np.load("distance_dict.npy")
    distance_dict = torch.from_numpy(distance_dict).float().cuda()

    # NOTE(review): the `root` argument is overwritten here, so callers cannot
    # change the data location. Left as is because the default argument
    # ('/proxy/') differs from this path - confirm which one is intended.
    root = "./proxy/"

    dataset = Dataset(train_split, root, mode)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False,
                                             num_workers=8, pin_memory=True)

    val_dataset = Dataset(test_split, root, mode)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                                                 num_workers=8, pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}

    # setup the model
    if mode == 'flow':
        i3d = InceptionI3d(400, in_channels=2)
        # Imagenet Pretraining
        i3d.load_state_dict(torch.load('models/flow_imagenet.pt'))
    else:
        # You can modify the number of outputs in the file Siamese_I3D.py
        i3d = ProxyNetwork()

    i3d.cuda()
    i3d = nn.DataParallel(i3d)

    lr = init_lr
    optimizer = optim.SGD(i3d.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

    num_steps_per_update = 1  # accum gradient
    steps = 0
    # 10800 is the number of elements in the training set
    len_training_set = 10800

    # train it
    while steps < max_steps:  # for epoch in range(num_epochs):
        print('Step {}/{}'.format(steps, max_steps))
        t1 = time.time()
        processed_elements = 0
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            i3d.train(is_train)

            tot_loss = 0.0
            tot_loc_loss = 0.0
            tot_cls_loss = 0.0
            optimizer.zero_grad()

            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs
                inputs, labels = data
                # Count what was really loaded instead of assuming 40 per batch.
                processed_elements += inputs.size(0)

                inputs = inputs.cuda()
                labels = labels.cuda()

                # Validation never updates weights, so skip the autograd graph there.
                with torch.set_grad_enabled(is_train):
                    # Custom loss implementation
                    # Depending on the "real" labels
                    per_frame_logits = i3d(inputs)
                    for i in range(labels.shape[0]):
                        per_frame_logits[i] *= distance_dict[labels[i][0][0]]

                    per_frame_logits = per_frame_logits.squeeze()
                    labels = labels.squeeze().long()

                    # compute localization loss
                    loc_loss = F.cross_entropy(per_frame_logits, labels)
                    tot_loc_loss += loc_loss.item()

                    loss = loc_loss / num_steps_per_update
                    tot_loss += loss.item()

                if is_train:
                    loss.backward()

                print("processed elements : " + str(processed_elements) + " / " +
                      str(len_training_set))
                print(time.time() - t1)

                if is_train:
                    steps += 1
                    optimizer.step()
                    optimizer.zero_grad()
                    lr_sched.step()
                    if steps % 1 == 0:
                        print(
                            '{} Train Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'
                            .format(phase,
                                    tot_loc_loss / (10 * num_steps_per_update),
                                    tot_cls_loss / (10 * num_steps_per_update),
                                    tot_loss / 10))
                        # save model
                        torch.save(i3d, "customloss" + str(steps) + '.pt')
                        tot_loss = tot_loc_loss = tot_cls_loss = 0.

            if phase == 'val':
                print(
                    '{} Val Loc Loss: {:.4f} Cls Loss: {:.4f} Tot Loss: {:.4f}'
                    .format(phase, tot_loc_loss, tot_cls_loss,
                            (tot_loss * num_steps_per_update)))
def run(init_lr=0.01, max_steps=200, mode='rgb',
        root='/media/pritesh/Entertainment/Visual-Tactile_Dataset/dataset/',
        train_split='train.txt', test_split='test.txt', batch_size=5, save_model=''):
    """Train FusionNet on top of a frozen, ImageNet-pretrained RGB I3D.

    Every epoch runs a training pass and a validation pass. The running mean
    of the binary cross-entropy loss is printed, appended to
    ``i3d_video.txt`` and logged to TensorBoard per batch; the per-epoch mean
    is logged under ``errorAtEpoch/<phase>``. Every 50th epoch the weights and
    a full checkpoint are saved after each phase.

    Args:
        init_lr: Initial learning rate of the SGD optimizer.
        max_steps: Number of epochs to run.
        mode: Forwarded to ``Dataset``.
        root: Dataset root directory, forwarded to ``Dataset``.
        train_split: Split file of the training set.
        test_split: Split file of the validation set.
        batch_size: Batch size of both data loaders.
        save_model: Prefix prepended to the saved weight file names.
    """
    writer = tensorboardX.SummaryWriter()
    use_cuda = torch.cuda.is_available()

    # setup dataset
    train_transforms = transforms.Compose([
        videotransforms.RandomCrop(224),
        videotransforms.RandomHorizontalFlip(),
    ])
    test_transforms = transforms.Compose([videotransforms.CenterCrop(224)])

    dataset = Dataset(train_split, root, mode, train_transforms)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True,
                                             num_workers=3, pin_memory=True)

    val_dataset = Dataset(test_split, root, mode, test_transforms)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True,
                                                 num_workers=3, pin_memory=True)

    dataloaders = {'train': dataloader, 'val': val_dataloader}

    # setup the model
    sm = InceptionI3d(400, in_channels=3)
    sm.load_state_dict(torch.load('models/rgb_imagenet.pt'))
    #tm = InceptionI3d(400, in_channels=2)
    #tm.load_state_dict(torch.load('models/flow_imagenet.pt'))
    sm.replace_logits(1)
    sm = freeze_network_layer(sm)
    # add your network here
    fusedNet = FusionNet(sm)
    if use_cuda:
        fusedNet.cuda()
        fusedNet = nn.DataParallel(fusedNet)

    lr = init_lr
    optimizer = optim.SGD(fusedNet.parameters(), lr=lr, momentum=0.9, weight_decay=0.0000001)
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [50, 100, 150, 200])

    steps = 0
    try:
        with open('i3d_video.txt', 'w') as file:
            file.write("train and validation loss file\n")

        # train it
        while steps < max_steps:  # for epoch in range(num_epochs):
            print('Step {}/{}'.format(steps, max_steps))
            print('-' * 10)

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                print('phase : {}'.format(phase))
                is_train = phase == 'train'
                fusedNet.train(is_train)

                tot_cls_loss = 0.0
                num_iter = 0
                optimizer.zero_grad()

                # Iterate over data.
                for data in dataloaders[phase]:
                    num_iter += 1
                    # Only the first video stream and the labels are used here.
                    f_vid, l_vid, tactile, pos, labels = data
                    if use_cuda:
                        f_vid = f_vid.cuda()
                        labels = labels.cuda()

                    with torch.set_grad_enabled(is_train):
                        per_frame_logits = fusedNet(f_vid.float())
                        # compute classification loss (with max-pooling along time B x C x T)
                        per_frame_logits = per_frame_logits.squeeze(1)
                        cls_loss = F.binary_cross_entropy_with_logits(
                            per_frame_logits.double(), labels.double())
                    tot_cls_loss += cls_loss.item()

                    running_loss = tot_cls_loss / num_iter
                    print('{} Loss: {:.4f} and lr: {}'.format(phase, running_loss, init_lr))
                    with open('i3d_video.txt', 'a') as file:
                        file.write("%f\n" % running_loss)

                    # Weights must only change on training data; stepping during
                    # the 'val' phase would fit the model to the validation set.
                    if is_train:
                        cls_loss.backward()
                        optimizer.step()
                        optimizer.zero_grad()

                    writer.add_scalar('error/' + phase, running_loss, num_iter)

                if (steps % 50 == 0):
                    # DataParallel is only applied when CUDA is available.
                    net = fusedNet.module if hasattr(fusedNet, 'module') else fusedNet
                    torch.save(net.state_dict(),
                               save_model + phase + str(steps).zfill(6) + '.pt')
                    save_checkpoint(fusedNet, optimizer, lr_sched, steps)

                # save error at every epoch
                writer.add_scalar('errorAtEpoch/' + phase, (tot_cls_loss / num_iter), steps)
                tot_cls_loss = 0.

            steps += 1
            lr_sched.step()
    finally:
        # Flush pending TensorBoard events even if training is interrupted.
        writer.close()
transforms=test_transforms, stride=8, fm_us=64) Video_loader = torch.utils.data.DataLoader(val_video_data, batch_size=args.batch_size, shuffle=False, num_workers=data_workers, pin_memory=True) print('Find {} train samples '.format(len(val_video_data))) for i in args.__dict__.keys(): print(i, ':\t', args.__dict__[i]) I3D = InceptionI3d(400, in_channels=3, dropout_keep_prob=0) I3D.replace_logits(157) # I3D = nn.DataParallel(I3D).cuda() I3D.load_state_dict(torch.load('../model/rgb_charades.pt')) I3D = nn.DataParallel(I3D).cuda() # I3D.load_state_dict(torch.load(args.model_pth)) VD_pred = vid_map(Video_loader, I3D, epoch, print_freq=10) # output = [] # gt = [] Map = winsmooth(VD_pred) mAP, _, ap = map.charades_map(np.vstack(Map[0]), np.vstack(Map[1])) print('The final mAp is:', mAP) submission_file(