def validate(val_list, model, criterion):
    print("begin test")
    test_loader = listDataset(
        val_list,
        shuffle=True,
        transform=ST.Compose([
            ST.ToNumpy(),
            ST.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
        ]),
        train=True,
        seen=model.seen,
        batch_size=args.batch_size,
        num_workers=args.workers,
    )
    model.eval()
    mae = 0
    for i, (img, target) in enumerate(test_loader):
        img = flow.Tensor(img, dtype=flow.float32, device="cuda")
        with flow.no_grad():
            output = model(img).to("cuda")
        # accumulate absolute error between predicted and ground-truth counts
        mae += abs(output.data.sum().numpy() - target.sum())
    mae = mae / len(test_loader)
    print(" * MAE {mae:.3f} ".format(mae=mae))
    return mae

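# A minimal sketch (not from the source) of how validate() could drive
# best-model selection across epochs; `args.epochs`, `best_mae`, and the save
# path are assumptions for illustration.
best_mae = float("inf")
for epoch in range(args.epochs):
    train(train_list, model, criterion, optimizer, epoch)
    mae = validate(val_list, model, criterion)
    if mae < best_mae:  # lower MAE is better for crowd counting
        best_mae = mae
        flow.save(model.state_dict(), "checkpoint/best_model")
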
def main():
    transform = ST.Compose([
        ST.ToNumpyForVal(),
        ST.Normalize(mean=[0.485, 0.456, 0.406],
                     std=[0.229, 0.224, 0.225]),
    ])
    global args
    args = parser.parse_args()
    model = CSRNet()
    model = model.to("cuda")
    # checkpoint = flow.load('checkpoint/Shanghai_BestModelA/shanghaiA_bestmodel')
    checkpoint = flow.load(args.modelPath)
    model.load_state_dict(checkpoint)

    img = transform(Image.open(args.picPath).convert("RGB"))
    img = flow.Tensor(img)
    img = img.to("cuda")
    output = model(img.unsqueeze(0))
    print("Predicted Count : ", int(output.detach().to("cpu").sum().numpy()))

    # move the density map off the GPU before converting it for plotting
    temp = output.detach().to("cpu").view(output.shape[2], output.shape[3])
    temp = temp.numpy()
    plt.title("Predicted Count")
    plt.imshow(temp, cmap=c.jet)
    plt.show()

    temp = h5py.File(args.picDensity, "r")
    temp_1 = np.asarray(temp["density"])
    plt.title("Original Count")
    plt.imshow(temp_1, cmap=c.jet)
    print("Original Count : ", int(np.sum(temp_1)) + 1)
    plt.show()

    print("Original Image")
    plt.title("Original Image")
    plt.imshow(plt.imread(args.picPath))
    plt.show()

def main():
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    use_gpu = torch.cuda.is_available()

    sys.stdout = Logger(osp.join(args.resume, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = None

    pin_memory = True if use_gpu else False

    queryloader = DataLoader(
        VideoDataset(dataset.query,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=1, shuffle=False, num_workers=0,
        pin_memory=pin_memory, drop_last=False)

    galleryloader = DataLoader(
        VideoDataset(dataset.gallery,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=1, shuffle=False, num_workers=0,
        pin_memory=pin_memory, drop_last=False)

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch, num_classes=dataset.num_train_pids)
    print("Model size: {:.5f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))

    for epoch in args.test_epochs:
        model_path = osp.join(args.resume,
                              'checkpoint_ep' + str(epoch) + '.pth.tar')
        print("Loading checkpoint from '{}'".format(model_path))
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        if use_gpu:
            model = model.cuda()

        print("Evaluate")
        with torch.no_grad():
            test(model, queryloader, galleryloader, use_gpu)

def getDataSets(dataset):
    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToNumpy(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = TT.TemporalBeginCrop()

    queryset = VideoDataset(dataset.query,
                            spatial_transform=spatial_transform_test,
                            temporal_transform=temporal_transform_test)
    galleryset = VideoDataset(dataset.gallery,
                              spatial_transform=spatial_transform_test,
                              temporal_transform=temporal_transform_test)
    return queryset, galleryset

def getDataSets(dataset):
    spatial_transform_train = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.RandomHorizontalFlip(),
        ST.ToNumpy(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_train = TT.TemporalRandomCrop(size=args.seq_len,
                                                     stride=args.sample_stride)

    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToNumpy(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = TT.TemporalBeginCrop()

    if args.dataset != 'mars':
        trainset = VideoDataset(dataset.train_dense,
                                spatial_transform=spatial_transform_train,
                                temporal_transform=temporal_transform_train)
    else:
        trainset = VideoDataset(dataset.train,
                                spatial_transform=spatial_transform_train,
                                temporal_transform=temporal_transform_train)

    queryset = VideoDataset(dataset.query,
                            spatial_transform=spatial_transform_test,
                            temporal_transform=temporal_transform_test)
    galleryset = VideoDataset(dataset.gallery,
                              spatial_transform=spatial_transform_test,
                              temporal_transform=temporal_transform_test)
    return trainset, queryset, galleryset

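# A sketch of wrapping the datasets returned above in DataLoaders, following
# the loader settings used elsewhere in this code base; the exact batch-size
# arguments and plain shuffling (no identity sampler) are assumptions.
trainset, queryset, galleryset = getDataSets(dataset)
trainloader = DataLoader(trainset, batch_size=args.train_batch, shuffle=True,
                         num_workers=args.workers, pin_memory=True, drop_last=True)
queryloader = DataLoader(queryset, batch_size=args.test_batch, shuffle=False,
                         num_workers=args.workers, pin_memory=True, drop_last=False)
galleryloader = DataLoader(galleryset, batch_size=args.test_batch, shuffle=False,
                           num_workers=args.workers, pin_memory=True, drop_last=False)
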
def train(train_list, model, criterion, optimizer, epoch):
    losses = AverageMeter()
    batch_time = AverageMeter()
    data_time = AverageMeter()

    train_loader = listDataset(
        train_list,
        shuffle=True,
        transform=ST.Compose([
            ST.ToNumpy(),
            ST.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
        ]),
        train=True,
        seen=model.seen,
        batch_size=args.batch_size,
        num_workers=args.workers,
    )
    model.train()
    end = time.time()

    for i, (img, target) in enumerate(train_loader):
        data_time.update(time.time() - end)

        img = flow.Tensor(img, dtype=flow.float32, device="cuda")
        output = model(img).to("cuda")
        output = flow.Tensor(output, device="cuda")
        target = flow.Tensor(target, device="cuda").unsqueeze(0)
        loss = criterion(output, target)
        losses.update(loss.numpy(), img.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            print("Epoch: [{0}][{1}/{2}]\t"
                  "Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t"
                  "Data {data_time.val:.3f} ({data_time.avg:.3f})\t"
                  "Loss {loss.val:.4f} ({loss.avg:.4f})\t".format(
                      epoch, i, len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses,
                  ))

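# The train()/validate() helpers above rely on an AverageMeter exposing .val
# and .avg attributes and an update(value, n) method. A minimal compatible
# implementation (an assumption; the original class is defined elsewhere):
class AverageMeter(object):
    """Tracks the most recent value and the running average."""

    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
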
def main():
    transform = ST.Compose([
        ST.ToNumpyForVal(),
        ST.Normalize(mean=[0.485, 0.456, 0.406],
                     std=[0.229, 0.224, 0.225]),
    ])
    global args
    args = parser.parse_args()

    root = "./dataset/"
    # now generate the ShanghaiA's ground truth
    part_A_train = os.path.join(root, "part_A_final/train_data", "images")
    part_A_test = os.path.join(root, "part_A_final/test_data", "images")
    part_B_train = os.path.join(root, "part_B_final/train_data", "images")
    part_B_test = os.path.join(root, "part_B_final/test_data", "images")

    path_sets = []
    if args.picSrc == "part_A_test":
        path_sets = [part_A_test]
    elif args.picSrc == "part_B_test":
        path_sets = [part_B_test]

    img_paths = []
    for path in path_sets:
        for img_path in glob.glob(os.path.join(path, "*.jpg")):
            img_paths.append(img_path)

    model = CSRNet()
    model = model.to("cuda")
    checkpoint = flow.load(args.modelPath)
    model.load_state_dict(checkpoint)

    MAE = []
    for i in range(len(img_paths)):
        img = transform(Image.open(img_paths[i]).convert("RGB"))
        img = np.asarray(img).astype(np.float32)
        img = flow.Tensor(img, dtype=flow.float32, device="cuda")
        img = img.to("cuda")
        gt_file = h5py.File(
            img_paths[i].replace(".jpg", ".h5").replace("images", "ground_truth"),
            "r")
        groundtruth = np.asarray(gt_file["density"])
        with flow.no_grad():
            output = model(img.unsqueeze(0))
        mae = abs(output.sum().numpy() - np.sum(groundtruth))
        MAE.append(mae)

    avg_MAE = sum(MAE) / len(MAE)
    print("test result: MAE:{:.2f}".format(avg_MAE))

def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False

    # set a learning rate
    if args.lr_factor == -1:
        args.lr_factor = random()
    args.lr = args.lr_factor * 10**-args.lr_base
    #print(f"Choose learning rate {args.lr}")

    sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'), mode='a')
    print("==========\nArgs:{}\n==========".format(args))
    #assert torch.distributed.is_available()
    #print("Initializing DDP by nccl-tcp({}) rank({}) world_size({})".format(args.init_method, args.rank, args.world_size))
    #dist.init_process_group(backend='nccl', init_method=args.init_method, rank=args.rank, world_size=args.world_size)

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_train = [
        ST.Scale((args.height, args.width), interpolation=3),
        ST.RandomHorizontalFlip(),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
    spatial_transform_train = ST.Compose(spatial_transform_train)
    temporal_transform_train = TT.TemporalRandomCrop(size=args.seq_len,
                                                     stride=args.sample_stride)
    #temporal_transform_train = TT.TemporalRandomCropPick(size=args.seq_len, stride=args.sample_stride)

    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = TT.TemporalBeginCrop(size=args.test_frames)

    pin_memory = True if use_gpu else False

    dataset_train = dataset.train
    if args.dataset == 'duke':
        dataset_train = dataset.train_dense
        print('process duke dataset')

    # NOTE: `sampler` is only defined for the lsvid and mars datasets;
    # any other dataset would raise a NameError below.
    #sampler = RandomIdentitySampler(dataset_train, num_instances=args.num_instances)
    if args.dataset == 'lsvid':
        sampler = RandomIdentitySampler(dataset_train, num_instances=args.num_instances)
    elif args.dataset == 'mars':
        sampler = RandomIdentitySampler(dataset_train, num_instances=args.num_instances)

    trainloader = DataLoader(
        VideoDataset(dataset_train,
                     spatial_transform=spatial_transform_train,
                     temporal_transform=temporal_transform_train),
        sampler=sampler,
        batch_size=args.train_batch,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )
    '''
    for batch_idx, (vids, pids, camids, img_paths) in enumerate(trainloader):
        print(batch_idx, pids, camids, img_paths)
        break
    return
    '''

    dataset_query = dataset.query
    dataset_gallery = dataset.gallery
    if args.dataset == 'lsvid':
        dataset_query = dataset.val_query
        dataset_gallery = dataset.val_gallery
        print('process lsvid dataset')

    queryloader = DataLoader(
        VideoDataset(dataset_query,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False
    )
    galleryloader = DataLoader(
        VideoDataset(dataset_gallery,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False
    )

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              use_gpu=use_gpu,
                              num_classes=dataset.num_train_pids,
                              loss={'xent', 'htri'})
    #print(model)

    if args.resume:
        print("Loading checkpoint from '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['state_dict'])

    criterion_xent = nn.CrossEntropyLoss()
    criterion_htri = TripletLoss(margin=args.margin, distance=args.distance, use_gpu=use_gpu)
    criterion_htri_c = TripletInterCamLoss(margin=args.margin, distance=args.distance, use_gpu=use_gpu)
    #criterion_htri_c = TripletWeightedInterCamLoss(margin=args.margin, distance=args.distance, use_gpu=use_gpu, alpha=args.cam_alpha)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=args.stepsize, gamma=args.gamma)
    start_epoch = args.start_epoch

    if use_gpu:
        model = nn.DataParallel(model).cuda()
        #model = model.cuda()
        #model = nn.parallel.DistributedDataParallel(model)

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(start_epoch, args.max_epoch):
        #print("Set sampler seed to {}".format(args.seed*epoch))
        #sampler.set_seed(args.seed*epoch)
        start_train_time = time.time()
        train(epoch, model, criterion_xent, criterion_htri, criterion_htri_c,
              optimizer, trainloader, use_gpu)
        train_time += round(time.time() - start_train_time)

        scheduler.step()

        if (epoch+1) >= args.start_eval and (epoch+1) % args.eval_step == 0 or epoch == 0:
            print("==> Test")
            with torch.no_grad():
                rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint({
                'state_dict': state_dict,
                'rank1': rank1,
                'epoch': epoch,
            }, is_best, osp.join(args.save_dir, 'checkpoint_ep' + str(epoch+1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print("Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".format(elapsed, train_time))

        try:
            # (T, C, H, W) -> (C, T, H, W); size(1) is then the temporal length
            clip = torch.stack(clip, 0).permute(1, 0, 2, 3)
            assert clip.size(1) == self.sample_duration
        except Exception:
            print("stack failed or clip is not right size", flush=True)
            print(len(clip), flush=True)
            print([cl.size() for cl in clip], flush=True)
            print(id, flush=True)
        clips.append(clip)
        return {
            'id': id,
            'duration': duration,
            'timestamps': timestamps,
            'fps': fps,
            'clips': clips
        }

    def __len__(self):
        return self.len


if __name__ == '__main__':
    sp = spt.Compose([spt.CornerCrop(size=224), spt.ToTensor()])
    tp = tpt.Compose([tpt.TemporalRandomCrop(16), tpt.LoopPadding(16)])
    dset = ActivityNetCaptions_Train('/ssd1/dsets/activitynet_captions',
                                     spatial_transform=sp,
                                     temporal_transform=tp)
    print(dset[0][0].size())
    print(dset[0][1])

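# A standalone check of the stack/permute step above, on dummy tensors
# (an illustration; shapes follow the assertion in the dataset code):
import torch

frames = [torch.zeros(3, 224, 224) for _ in range(16)]  # 16 RGB frames
clip = torch.stack(frames, 0).permute(1, 0, 2, 3)        # (16,3,224,224) -> (3,16,224,224)
assert clip.size(1) == 16                                # temporal length on dim 1
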
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False

    # set a learning rate
    #if args.lr_factor == -1:
    #    args.lr_factor = random()
    #args.lr = args.lr_factor * 10**-args.lr_base
    #print(f"Choose learning rate {args.lr}")

    sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'), mode='a')
    print("==========\nArgs:{}\n==========".format(args))
    #assert torch.distributed.is_available()
    #print("Initializing DDP by nccl-tcp({}) rank({}) world_size({})".format(args.init_method, args.rank, args.world_size))
    #dist.init_process_group(backend='nccl', init_method=args.init_method, rank=args.rank, world_size=args.world_size)

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_train = [
        ST.Scale((args.height, args.width), interpolation=3),
        ST.RandomHorizontalFlip(),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
    spatial_transform_train = ST.Compose(spatial_transform_train)
    temporal_transform_train = TT.TemporalRandomCrop(size=args.seq_len,
                                                     stride=args.sample_stride)
    #temporal_transform_train = TT.TemporalRandomCropPick(size=args.seq_len, stride=args.sample_stride)

    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = TT.TemporalBeginCrop(size=args.test_frames)

    pin_memory = True if use_gpu else False

    dataset_train = dataset.train
    if args.dataset == 'duke':
        dataset_train = dataset.train_dense
        print('process duke dataset')

    # NOTE: `sampler` is only defined for the lsvid and mars datasets
    # (15 and 6 cameras respectively); any other dataset would raise a
    # NameError below.
    #sampler = RandomIdentitySampler(dataset_train, num_instances=args.num_instances)
    if args.dataset == 'lsvid':
        sampler = RandomIdentityCameraSampler(dataset_train, num_instances=args.num_instances, num_cam=15)
    elif args.dataset == 'mars':
        sampler = RandomIdentityCameraSampler(dataset_train, num_instances=args.num_instances, num_cam=6)

    trainloader = DataLoader(
        VideoDataset(dataset_train,
                     spatial_transform=spatial_transform_train,
                     temporal_transform=temporal_transform_train),
        sampler=sampler,
        batch_size=args.train_batch,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )
    '''
    for batch_idx, (vids, pids, camids, img_paths) in enumerate(trainloader):
        print(batch_idx, pids, camids, img_paths)
        break
    return
    '''

    dataset_query = dataset.query
    dataset_gallery = dataset.gallery
    if args.dataset == 'lsvid':
        dataset_query = dataset.val_query
        dataset_gallery = dataset.val_gallery
        print('process lsvid dataset')

    queryloader = DataLoader(
        VideoDataset(dataset_query,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False
    )
    galleryloader = DataLoader(
        VideoDataset(dataset_gallery,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False
    )

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              use_gpu=use_gpu,
                              num_classes=dataset.num_train_pids,
                              loss={'xent', 'htri'},
                              transformer_num_heads=args.transformer_num_heads,
                              transformer_num_layers=args.transformer_num_layers,
                              attention_flatness=True)
    #print(model)

    if args.resume:
        print("Loading checkpoint from '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['state_dict'])

    criterion_xent = nn.CrossEntropyLoss()
    criterion_flat = FlatnessLoss(reduction='batchmean', use_gpu=use_gpu)
    criterion_htri_c = TripletInterCamLoss(margin=args.margin, distance=args.distance, use_gpu=use_gpu)
    #criterion_htri_c = TripletWeightedInterCamLoss(margin=args.margin, distance=args.distance, use_gpu=use_gpu, alpha=args.cam_alpha)

    #optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # linear learning-rate scaling: scale the base lr by the total batch size
    # (per-GPU batch * number of GPUs) relative to a reference batch of 512
    linear_scaled_lr = args.lr * args.train_batch * len(args.gpu_devices.split(',')) / 512.0
    args.lr = linear_scaled_lr

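# Worked example of the linear scaling rule above (illustrative numbers,
# not from the source): base lr 0.0005, train_batch 32, 2 GPUs.
base_lr, train_batch, n_gpus = 0.0005, 32, 2
print(base_lr * train_batch * n_gpus / 512.0)  # 6.25e-05
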
args = parse_args()
print(args)

# load vocabulary
vocab = Vocabulary(token_level=args.token_level)
vocpath = os.path.join(args.root_path, args.vocabpath)
try:
    assert os.path.exists(vocpath)
except AssertionError:
    print("didn't find vocab in {}! aborting".format(vocpath))
    sys.exit(1)
vocab.load(vocpath)
vocab_size = len(vocab)

# transforms
sp = spt.Compose([spt.CornerCrop(size=args.imsize), spt.ToTensor()])
tp = tpt.Compose([tpt.TemporalRandomCrop(args.clip_len),
                  tpt.LoopPadding(args.clip_len)])

# dataloading
collatefn = functools.partial(collater, args.max_seqlen)
dset = ActivityNetCaptions(args.root_path, args.meta_path, args.mode, vocab,
                           args.framepath, spatial_transform=sp,
                           temporal_transform=tp,
                           sample_duration=args.clip_len)
dloader = DataLoader(dset, batch_size=args.batch_size, shuffle=True,
                     num_workers=args.n_cpu, collate_fn=collatefn,
                     drop_last=True)
max_it = int(len(dset) / args.batch_size)

# models
video_encoder, params = generate_model(args)
# rewrite part of average pooling
if args.langmethod == 'Transformer':
    scale = 16
    inter_time = int(args.clip_len / scale)
    video_encoder.avgpool = nn.AdaptiveAvgPool3d((inter_time, 1, 1))

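# What the avgpool rewrite above does to the encoder output, shown on a dummy
# feature map (the channel count of 512 is illustrative):
import torch
import torch.nn as nn

clip_len, scale = 64, 16
feat = torch.zeros(1, 512, 8, 7, 7)                 # (N, C, T', H, W)
pool = nn.AdaptiveAvgPool3d((clip_len // scale, 1, 1))
print(pool(feat).shape)  # torch.Size([1, 512, 4, 1, 1]): one feature per time step
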
def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()

    if not args.evaluate:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'))
    else:
        sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_train = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.RandomHorizontalFlip(),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_train = TT.TemporalRandomCrop(size=args.seq_len,
                                                     stride=args.sample_stride)

    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = TT.TemporalBeginCrop()

    transform_test_img = T.Compose([
        T.Resize((args.height, args.width), interpolation=3),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    pin_memory = True if use_gpu else False

    if args.dataset == 'dukevid':
        trainloader = DataLoader(
            VideoDataset(dataset.train_dense,
                         spatial_transform=spatial_transform_train,
                         temporal_transform=temporal_transform_train),
            sampler=RandomIdentitySampler(dataset.train_dense,
                                          num_instances=args.num_instances),
            batch_size=args.train_batch,
            num_workers=args.workers,
            pin_memory=pin_memory,
            drop_last=True,
        )
    else:
        trainloader = DataLoader(
            VideoDataset(dataset.train,
                         spatial_transform=spatial_transform_train,
                         temporal_transform=temporal_transform_train),
            sampler=RandomIdentitySampler(dataset.train,
                                          num_instances=args.num_instances),
            batch_size=args.train_batch,
            num_workers=args.workers,
            pin_memory=pin_memory,
            drop_last=True,
        )

    queryloader = DataLoader(
        VideoDataset(dataset.query,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=0,
        pin_memory=pin_memory, drop_last=False)

    galleryloader = DataLoader(
        VideoDataset(dataset.gallery,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=0,
        pin_memory=pin_memory, drop_last=False)

    queryimgloader = DataLoader(
        ImageDataset(dataset.query_img, transform=transform_test_img),
        batch_size=args.img_test_batch, shuffle=False,
        num_workers=args.workers, pin_memory=pin_memory, drop_last=False)

    galleryimgloader = DataLoader(
        ImageDataset(dataset.gallery_img, transform=transform_test_img),
        batch_size=args.img_test_batch, shuffle=False,
        num_workers=args.workers, pin_memory=pin_memory, drop_last=False)

    print("Initializing model: {} and {}".format(args.vid_arch, args.img_arch))
    vid_model = models.init_model(name=args.vid_arch)
    img_model = models.init_model(name=args.img_arch)
    classifier = models.init_model(name='classifier', num_classes=dataset.num_train_pids)
    print("Video model size: {:.5f}M".format(
        sum(p.numel() for p in vid_model.parameters()) / 1000000.0))
    print("Image model size: {:.5f}M".format(
        sum(p.numel() for p in img_model.parameters()) / 1000000.0))

    criterion = nn.CrossEntropyLoss()
    criterion_tkp_f = FeatureBasedTKP(bp_to_vid=args.bp_to_vid)
    criterion_tkp_d = SimilarityBasedTKP(distance='euclidean', bp_to_vid=args.bp_to_vid)
    criterion_i2v = HeterogeneousTripletLoss(margin=0.3, distance='euclidean')

    optimizer = torch.optim.Adam(
        [{'params': vid_model.parameters(), 'lr': args.lr},
         {'params': img_model.parameters(), 'lr': args.lr},
         {'params': classifier.parameters(), 'lr': args.lr}],
        weight_decay=args.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=args.stepsize, gamma=args.gamma)
    start_epoch = args.start_epoch

    if args.resume:
        print("Loading checkpoint from '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        vid_model.load_state_dict(checkpoint['vid_model_state_dict'])
        img_model.load_state_dict(checkpoint['img_model_state_dict'])
        classifier.load_state_dict(checkpoint['classifier_state_dict'])
        start_epoch = checkpoint['epoch']

    if use_gpu:
        vid_model = vid_model.cuda()
        img_model = img_model.cuda()
        classifier = classifier.cuda()

    if args.evaluate:
        print("Evaluate only")
        with torch.no_grad():
            test(vid_model, img_model, queryloader, galleryloader,
                 queryimgloader, galleryimgloader, use_gpu)
        return

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(start_epoch, args.max_epoch):
        scheduler.step()
        start_train_time = time.time()
        train(epoch, vid_model, img_model, classifier, criterion,
              criterion_tkp_f, criterion_tkp_d, criterion_i2v, optimizer,
              trainloader, use_gpu)
        torch.cuda.empty_cache()
        train_time += round(time.time() - start_train_time)

        if (epoch + 1) >= args.start_eval and args.eval_step > 0 and (
                epoch + 1) % args.eval_step == 0 or (epoch + 1) == args.max_epoch:
            print("==> Test")
            with torch.no_grad():
                rank1 = test(vid_model, img_model, queryloader, galleryloader,
                             queryimgloader, galleryimgloader, use_gpu)
            torch.cuda.empty_cache()
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            vid_model_state_dict = vid_model.state_dict()
            img_model_state_dict = img_model.state_dict()
            classifier_state_dict = classifier.state_dict()
            save_checkpoint(
                {
                    'vid_model_state_dict': vid_model_state_dict,
                    'img_model_state_dict': img_model_state_dict,
                    'classifier_state_dict': classifier_state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(args.save_dir, 'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print("Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".format(elapsed, train_time))

def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False

    sys.stdout = Logger(osp.join(args.save_dir, 'log_train.txt'), mode='a')
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_train = [
        ST.Scale((args.height, args.width), interpolation=3),
        ST.RandomHorizontalFlip(),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]
    spatial_transform_train = ST.Compose(spatial_transform_train)
    temporal_transform_train = TT.TemporalRandomCrop(size=args.seq_len,
                                                     stride=args.sample_stride)

    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = TT.TemporalBeginCrop(size=args.test_frames)

    pin_memory = True if use_gpu else False

    dataset_train = dataset.train
    if args.dataset == 'duke':
        dataset_train = dataset.train_dense
        print('process duke dataset')

    trainloader = DataLoader(
        VideoDataset(dataset_train,
                     spatial_transform=spatial_transform_train,
                     temporal_transform=temporal_transform_train),
        sampler=RandomIdentitySampler(dataset_train, num_instances=args.num_instances),
        batch_size=args.train_batch,
        num_workers=args.workers,
        pin_memory=pin_memory,
        drop_last=True,
    )

    queryloader = DataLoader(
        VideoDataset(dataset.query,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False)

    galleryloader = DataLoader(
        VideoDataset(dataset.gallery,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=args.workers,
        pin_memory=pin_memory, drop_last=False)

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              use_gpu=use_gpu,
                              num_classes=dataset.num_train_pids,
                              loss={'xent', 'htri'})
    print(model)

    criterion_xent = nn.CrossEntropyLoss()
    criterion_htri = TripletLoss(margin=args.margin, distance=args.distance, use_gpu=use_gpu)

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=args.stepsize, gamma=args.gamma)
    start_epoch = args.start_epoch

    if use_gpu:
        model = nn.DataParallel(model).cuda()

    start_time = time.time()
    train_time = 0
    best_rank1 = -np.inf
    best_epoch = 0
    print("==> Start training")

    for epoch in range(start_epoch, args.max_epoch):
        start_train_time = time.time()
        train(epoch, model, criterion_xent, criterion_htri, optimizer,
              trainloader, use_gpu)
        train_time += round(time.time() - start_train_time)

        scheduler.step()

        if (epoch + 1) >= args.start_eval and (epoch + 1) % args.eval_step == 0 or epoch == 0:
            print("==> Test")
            with torch.no_grad():
                rank1 = test(model, queryloader, galleryloader, use_gpu)
            is_best = rank1 > best_rank1
            if is_best:
                best_rank1 = rank1
                best_epoch = epoch + 1

            if use_gpu:
                state_dict = model.module.state_dict()
            else:
                state_dict = model.state_dict()
            save_checkpoint(
                {
                    'state_dict': state_dict,
                    'rank1': rank1,
                    'epoch': epoch,
                }, is_best,
                osp.join(args.save_dir, 'checkpoint_ep' + str(epoch + 1) + '.pth.tar'))

    print("==> Best Rank-1 {:.1%}, achieved at epoch {}".format(best_rank1, best_epoch))

    elapsed = round(time.time() - start_time)
    elapsed = str(datetime.timedelta(seconds=elapsed))
    train_time = str(datetime.timedelta(seconds=train_time))
    print("Finished. Total elapsed time (h:m:s): {}. Training time (h:m:s): {}.".format(elapsed, train_time))

def main():
    torch.manual_seed(args.seed)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    use_gpu = torch.cuda.is_available()
    if args.use_cpu:
        use_gpu = False

    sys.stdout = Logger(osp.join(args.save_dir, 'log_test1.txt'), mode='a')
    print("==========\nArgs:{}\n==========".format(args))

    if use_gpu:
        print("Currently using GPU {}".format(args.gpu_devices))
        torch.cuda.manual_seed_all(args.seed)
    else:
        print("Currently using CPU (GPU is highly recommended)")

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset)

    # Data augmentation
    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = None

    pin_memory = True if use_gpu else False

    queryloader = DataLoader(
        VideoDataset(dataset.query,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=1, shuffle=False, num_workers=0,
        pin_memory=pin_memory, drop_last=False)

    galleryloader = DataLoader(
        VideoDataset(dataset.gallery,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=1, shuffle=False, num_workers=0,
        pin_memory=pin_memory, drop_last=False)

    print("Initializing model: {}".format(args.arch))
    model = models.init_model(name=args.arch,
                              use_gpu=use_gpu,
                              num_classes=dataset.num_train_pids,
                              loss={'xent', 'htri'})

    if args.resume:
        print("Loading checkpoint from '{}'".format(args.resume))
        checkpoint = torch.load(args.resume)
        model.load_state_dict(checkpoint['state_dict'])

    if use_gpu:
        model = nn.DataParallel(model).cuda()
    model.eval()

    with torch.no_grad():
        evaluation(model, args, queryloader, galleryloader, use_gpu)

def main():
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_devices
    print(torch.cuda.device_count())
    use_gpu = torch.cuda.is_available()

    sys.stdout = Logger(osp.join(args.save_dir, 'log_test.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    print("Initializing dataset {}".format(args.dataset))
    dataset = data_manager.init_dataset(name=args.dataset, root=args.root)

    # Data augmentation
    spatial_transform_test = ST.Compose([
        ST.Scale((args.height, args.width), interpolation=3),
        ST.ToTensor(),
        ST.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    temporal_transform_test = None

    transform_test_img = T.Compose([
        T.Resize((args.height, args.width), interpolation=3),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    pin_memory = True if use_gpu else False

    queryloader = DataLoader(
        VideoDataset(dataset.query,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=0,
        pin_memory=pin_memory, drop_last=False)

    galleryloader = DataLoader(
        VideoDataset(dataset.gallery,
                     spatial_transform=spatial_transform_test,
                     temporal_transform=temporal_transform_test),
        batch_size=args.test_batch, shuffle=False, num_workers=0,
        pin_memory=pin_memory, drop_last=False)

    queryimgloader = DataLoader(
        ImageDataset(dataset.query_img, transform=transform_test_img),
        batch_size=args.img_test_batch, shuffle=False,
        num_workers=args.workers, pin_memory=pin_memory, drop_last=False)

    galleryimgloader = DataLoader(
        ImageDataset(dataset.gallery_img, transform=transform_test_img),
        batch_size=args.img_test_batch, shuffle=False,
        num_workers=args.workers, pin_memory=pin_memory, drop_last=False)

    print("Initializing model: {} and {}".format(args.vid_arch, args.img_arch))
    vid_model = models.init_model(name=args.vid_arch)
    img_model = models.init_model(name=args.img_arch)
    print("Video model size: {:.5f}M".format(
        sum(p.numel() for p in vid_model.parameters()) / 1000000.0))
    print("Image model size: {:.5f}M".format(
        sum(p.numel() for p in img_model.parameters()) / 1000000.0))

    print("Loading checkpoint from '{}'".format(args.resume))
    checkpoint = torch.load(args.resume)
    vid_model.load_state_dict(checkpoint['vid_model_state_dict'])
    img_model.load_state_dict(checkpoint['img_model_state_dict'])

    if use_gpu:
        vid_model = vid_model.cuda()
        img_model = img_model.cuda()

    print("Evaluate")
    with torch.no_grad():
        test(vid_model, img_model, queryloader, galleryloader,
             queryimgloader, galleryimgloader, use_gpu)

def train_lstm(args):
    # gpus
    device = torch.device('cuda' if args.cuda and torch.cuda.is_available() else 'cpu')

    # load vocabulary
    annfiles = [os.path.join(args.root_path, pth) for pth in args.annpaths]
    text_proc = build_vocab(annfiles, args.min_freq, args.max_seqlen)
    vocab_size = len(text_proc.vocab)

    # transforms
    sp = spt.Compose([spt.CornerCrop(size=args.imsize), spt.ToTensor()])
    tp = tpt.Compose([
        tpt.TemporalRandomCrop(args.clip_len),
        tpt.LoopPadding(args.clip_len)
    ])

    # dataloading
    train_dset = ActivityNetCaptions_Train(args.root_path,
                                           ann_path='train_fps.json',
                                           sample_duration=args.clip_len,
                                           spatial_transform=sp,
                                           temporal_transform=tp)
    trainloader = DataLoader(train_dset,
                             batch_size=args.batch_size,
                             shuffle=True,
                             num_workers=args.n_cpu,
                             drop_last=True,
                             timeout=100)
    max_train_it = int(len(train_dset) / args.batch_size)

    val_dset = ActivityNetCaptions_Val(args.root_path,
                                       ann_path=['val_1_fps.json', 'val_2_fps.json'],
                                       sample_duration=args.clip_len,
                                       spatial_transform=sp,
                                       temporal_transform=tp)
    valloader = DataLoader(val_dset,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=args.n_cpu,
                           drop_last=True,
                           timeout=100)
    #max_val_it = int(len(val_dset) / args.batch_size)
    max_val_it = 10

    # models
    video_encoder = generate_3dcnn(args)
    caption_gen = generate_rnn(vocab_size, args)
    models = [video_encoder, caption_gen]

    # initialize pretrained embeddings
    if args.emb_init is not None:
        begin = time.time()
        print("initializing embeddings from {}...".format(args.emb_init))
        lookup = get_pretrained_from_txt(args.emb_init)
        first = next(iter(lookup.values()))
        try:
            assert len(first) == args.embedding_size
        except AssertionError:
            print("embedding size not compatible with pretrained embeddings.")
            print("specified size {}, pretrained model includes size {}".format(
                args.embedding_size, len(first)))
            sys.exit(1)
        matrix = torch.randn_like(caption_gen.emb.weight)
        for char, vec in lookup.items():
            if char in text_proc.vocab.stoi.keys():
                id = text_proc.vocab.stoi[char]
                matrix[id, :] = torch.tensor(vec)
        caption_gen.init_embedding(matrix)
        print("{} | successfully initialized from {}".format(
            sec2str(time.time() - begin), args.emb_init))

    # move models to device
    n_gpu = torch.cuda.device_count()
    if n_gpu > 1 and args.dataparallel:
        video_encoder = nn.DataParallel(video_encoder)
        caption_gen = nn.DataParallel(caption_gen)
    else:
        n_gpu = 1
    print("using {} gpus...".format(n_gpu))

    # loss function
    criterion = nn.CrossEntropyLoss(ignore_index=text_proc.vocab.stoi['<pad>'])

    # optimizer, scheduler
    params = list(video_encoder.parameters()) + list(caption_gen.parameters())
    optimizer = optim.SGD([
        {"params": video_encoder.parameters(),
         "lr": args.lr_cnn,
         "momentum": args.momentum_cnn},
        {"params": caption_gen.parameters(),
         "lr": args.lr_rnn,
         "momentum": args.momentum_rnn}
    ], weight_decay=args.weight_decay)
    # ReduceLROnPlateau with mode='max': the lr is decayed when the monitored
    # validation metric (METEOR, passed to scheduler.step below) stops improving
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     mode='max',
                                                     factor=0.9,
                                                     patience=args.patience,
                                                     verbose=True)

    # count parameters
    num_params = sum(count_parameters(model) for model in models)
    print("# of params in model : {}".format(num_params))

    # joint training loop
    print("start training")
    begin = time.time()
    for ep in range(args.max_epochs):
        # train for epoch
        video_encoder, caption_gen, optimizer = train_epoch(
            trainloader, video_encoder, caption_gen, optimizer, criterion,
            device, text_proc, max_it=max_train_it, opt=args)

        # save models
        enc_save_dir = os.path.join(args.model_save_path, "encoder")
        enc_filename = "ep{:04d}.pth".format(ep + 1)
        if not os.path.exists(enc_save_dir):
            os.makedirs(enc_save_dir)
        enc_save_path = os.path.join(enc_save_dir, enc_filename)
        dec_save_dir = os.path.join(args.model_save_path, "decoder")
        dec_filename = "ep{:04d}.pth".format(ep + 1)
        dec_save_path = os.path.join(dec_save_dir, dec_filename)
        if not os.path.exists(dec_save_dir):
            os.makedirs(dec_save_dir)

        if n_gpu > 1 and args.dataparallel:
            torch.save(video_encoder.module.state_dict(), enc_save_path)
            torch.save(caption_gen.module.state_dict(), dec_save_path)
        else:
            torch.save(video_encoder.state_dict(), enc_save_path)
            torch.save(caption_gen.state_dict(), dec_save_path)
        print("saved encoder model to {}".format(enc_save_path))
        print("saved decoder model to {}".format(dec_save_path))

        # evaluate
        print("begin evaluation for epoch {} ...".format(ep + 1))
        nll, ppl, metrics = validate(valloader, video_encoder, caption_gen,
                                     criterion, device, text_proc,
                                     max_it=max_val_it, opt=args)
        if metrics is not None:
            scheduler.step(metrics["METEOR"])
        print("training time {}, epoch {:04d}/{:04d} done, validation loss: {:.06f}, perplexity: {:.03f}"
              .format(sec2str(time.time() - begin), ep + 1, args.max_epochs, nll, ppl))
    print("end training")
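
# Small demonstration of the ReduceLROnPlateau(mode='max') behavior used
# above, on a dummy optimizer (values are illustrative, not from the source):
import torch

opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='max',
                                                   factor=0.9, patience=2)
for meteor in [0.10, 0.12, 0.12, 0.12, 0.12]:  # metric plateaus after step 2
    sched.step(meteor)                          # lr cut to 0.09 once patience runs out
print(opt.param_groups[0]['lr'])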