def main(args):
    cfg = setup(args)

    if utils.get_rank() == 0:
        output_dir = cfg.OUTPUT_DIR
        # Record the current commit hash and working-tree diff for reproducibility.
        i = 0
        while os.path.exists('{}/commit_{}'.format(output_dir, i)):
            i += 1
        os.system('git log | head -n 1 > {}/commit_{}'.format(output_dir, i))
        os.system('git diff --no-prefix > {}/diff_{}'.format(output_dir, i))

    if args.wandb and utils.get_rank() == 0:
        prj_name = 'EOPSN'
        wandb.init(name=cfg.OUTPUT_DIR, project=prj_name,
                   entity='openset_panoptic', config=cfg,
                   sync_tensorboard=True, dir=cfg.OUTPUT_DIR,
                   resume=args.resume)

    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume)
        res = Trainer.test(cfg, model, use_wandb=args.wandb)
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    trainer = Trainer(cfg, use_wandb=args.wandb)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()
def test(self, cfg, model, evaluators=None, use_wandb=False):
    ret = super().test(cfg, model, evaluators)
    use_wandb = use_wandb or getattr(self, 'wandb', False)
    if use_wandb and utils.get_rank() == 0:
        # Flatten the nested {task: {metric: value}} results into "task/metric"
        # keys so each metric gets its own W&B chart.
        wandb_dict = {}
        for k, v in ret.items():
            wandb_dict.update({k + '/' + metric: value
                               for metric, value in v.items()})
        wandb.log(wandb_dict)
    # Return the evaluation results so callers (e.g. the eval-only path in main,
    # which passes them to verify_results) still receive them.
    return ret
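# Usage note for the flattening above (numbers are hypothetical): a result dict
# such as
#   {'panoptic_seg': {'PQ': 40.1, 'SQ': 77.5}}
# is logged to W&B as
#   {'panoptic_seg/PQ': 40.1, 'panoptic_seg/SQ': 77.5}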
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])

    # standard PyTorch mean-std input image normalization
    transform = T.Compose([
        T.Resize(800),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    pimg = Image.open(args.source).convert('RGB')
    t0 = time.time()
    scores, boxes = detect(pimg, model, transform, device)
    print(f'inference took {time.time() - t0:.3f} seconds.')
    plot_results(pimg, scores, boxes)
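# A minimal sketch of the `detect` helper assumed by the demo above, following
# the standard DETR inference recipe; this repo's own implementation may differ.
# It keeps queries whose best class probability clears `conf_thresh` and rescales
# the normalized [cx, cy, w, h] boxes to absolute [x0, y0, x1, y1] pixels.
import torch

def detect(im, model, transform, device, conf_thresh=0.7):
    img = transform(im).unsqueeze(0).to(device)  # batch of one image
    with torch.no_grad():
        outputs = model(img)
    # drop the trailing "no object" logit before scoring each query
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > conf_thresh
    cx, cy, w, h = outputs['pred_boxes'][0, keep].unbind(-1)
    boxes = torch.stack([cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2], dim=-1)
    im_w, im_h = im.size  # PIL images report (width, height)
    scale = torch.tensor([im_w, im_h, im_w, im_h],
                         dtype=torch.float32, device=boxes.device)
    return probas[keep].cpu(), (boxes * scale).cpu()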
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    # if args.distributed:
    #     sampler_train = DistributedSampler(dataset_train)
    #     sampler_val = DistributedSampler(dataset_val, shuffle=False)
    # else:
    #     sampler_train = torch.utils.data.RandomSampler(dataset_train)
    #     sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    # batch_sampler_train = torch.utils.data.BatchSampler(
    #     sampler_train, args.batch_size, drop_last=True)
    sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    # data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
    #                                collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn_leimao,
                                 num_workers=args.num_workers)

    for inputs, labels in data_loader_val:
        print("---------------------")
        print(inputs.shape)
        print(labels)
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    # Save our Wandb metadata
    if not args.no_wb:
        wandb.init(entity='dl-project', project='dl-final-project',
                   name=args.wb_name, notes=args.wb_notes, reinit=True)
        wandb.config.epochs = args.epochs

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    # visualize_video(model, postprocessors)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of trainable params:', n_parameters)
    wandb.config.n_parameters = n_parameters
    wandb.config.n_trainable_parameters = n_parameters  # better name

    # Log total # of model parameters (including frozen) to W&B
    n_total_parameters = sum(p.numel() for p in model.parameters())
    print('total number of parameters:', n_total_parameters)
    wandb.config.n_total_parameters = n_total_parameters

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    # For visualization we want the raw images without any normalization or random resizing
    dataset_val_without_resize = CocoDetection(
        "data/coco/val2017",
        annFile="data/coco/annotations/instances_val2017.json",
        transforms=T.Compose([T.ToTensor()]))

    # Save metadata about training + val datasets and batch size
    wandb.config.len_dataset_train = len(dataset_train)
    wandb.config.len_dataset_val = len(dataset_val)
    wandb.config.batch_size = args.batch_size

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers, pin_memory=True)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers, pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if not match_name_keywords(n, args.lr_backbone_names)
                       and not match_name_keywords(n, args.lr_linear_proj_names)
                       and p.requires_grad],
            "lr": args.lr,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
            "lr": args.lr_backbone,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
            "lr": args.lr * args.lr_linear_proj_mult,
        },
    ]

    # Not sure if we should save all hyperparameters in wandb.config?
    # just start with a few important ones
    wandb.config.lr = args.lr
    wandb.config.lr_backbone = args.lr_backbone

    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [k for k in unexpected_keys
                           if not (k.endswith('total_params') or k.endswith('total_ops'))]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            # print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for doing experiment that resume from checkpoint
            # and also modify lr scheduler (e.g., decrease lr in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print('Warning: (hack) args.override_resumed_lr_drop is set to True, '
                      'so args.lr_drop would override lr_drop in resumed lr_scheduler.')
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(
                    map(lambda group: group['initial_lr'], optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(
                model, criterion, postprocessors, data_loader_val, base_ds,
                device, args.output_dir)

    if args.eval:
        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)
        test_stats, coco_evaluator = evaluate(
            model, criterion, postprocessors, data_loader_val, base_ds,
            device, args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer,
                                      device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_file_for_wb = str(
                output_dir / f'{wandb.run.id}_checkpoint{epoch:04}.pth')
            checkpoint_paths = [output_dir / 'checkpoint.pth', checkpoint_file_for_wb]
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)
            # Save model checkpoint to W&B
            wandb.save(checkpoint_file_for_wb)

        # Generate visualizations for fixed(?) set of images every epoch
        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)

        test_stats, coco_evaluator = evaluate(
            model, criterion, postprocessors, data_loader_val, base_ds,
            device, args.output_dir)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        # Save the COCO metrics properly
        metric_name = ["AP", "AP50", "AP75", "APs", "APm", "APl",
                       "AR@1", "AR@10", "AR@100", "ARs", "ARm", "ARl"]
        for i, metric_val in enumerate(log_stats["test_coco_eval_bbox"]):
            log_stats[metric_name[i]] = metric_val

        if not args.no_wb:
            wandb.log(log_stats)

        print("train_loss: ", log_stats['train_loss'])

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            wandb.save(str(output_dir / "log.txt"))

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    eval_filename_for_wb = f'{wandb.run.id}_eval_{epoch:04}.pth'
                    eval_path_for_wb = str(output_dir / "eval" / eval_filename_for_wb)
                    filenames = ['latest.pth', eval_filename_for_wb]
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)
                    # TODO not sure if this file will end up being too big
                    # I think it's the COCO precision/recall metrics in some format...
                    # let's track it just in case to start!
                    wandb.save(eval_path_for_wb)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
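# A small, self-contained illustration of how match_name_keywords routes
# parameters into the three LR groups above. The parameter names and keyword
# lists here are invented for the example (Deformable-DETR's defaults are
# similar: 'backbone.0' and 'reference_points'/'sampling_offsets').
def _demo_param_groups():
    def match_name_keywords(n, name_keywords):
        return any(b in n for b in name_keywords)

    backbone_kw = ['backbone.0']
    proj_kw = ['reference_points', 'sampling_offsets']
    for name in ['backbone.0.body.conv1.weight',
                 'transformer.decoder.layers.0.cross_attn.sampling_offsets.weight',
                 'class_embed.weight']:
        if match_name_keywords(name, backbone_kw):
            group = 'backbone lr (args.lr_backbone)'
        elif match_name_keywords(name, proj_kw):
            group = 'projection lr (args.lr * args.lr_linear_proj_mult)'
        else:
            group = 'base lr (args.lr)'
        print(name, '->', group)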
def main(args):
    utils.init_distributed_mode(args)

    if args.output_dir is None:
        args.output_dir = os.path.expanduser('~/Data/AI2Thor_detection_features/')
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" in n and p.requires_grad],
         "lr": args.lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_all = build_dataset(image_set='all', args=args)
    if args.distributed:
        sampler_all = DistributedSampler(dataset_all, shuffle=False)
    else:
        sampler_all = torch.utils.data.SequentialSampler(dataset_all)
    data_loader_all = DataLoader(dataset_all, args.batch_size, sampler=sampler_all,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)
    base_ds = get_coco_api_from_dataset(dataset_all)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start extracting features")
    start_time = time.time()
    extract_feature(model, criterion, postprocessors, data_loader_all, base_ds,
                    device, args.output_dir)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Extracting features time {}'.format(total_time_str))

    print('Start combining files')
    start_time = time.time()
    data_dir = os.path.expanduser('~/Data/AI2Thor_offline_data_2.0.2/')
    combine_files(args, data_dir)
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Combining files time {}'.format(total_time_str))
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    print(args)

    device = torch.device(args.device)

    # Fix the seed for reproducibility.
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" in n and p.requires_grad],
         "lr": args.lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)

    # Load from pretrained DETR model.
    assert args.num_queries == 100, args.num_queries
    assert args.enc_layers == 6 and args.dec_layers == 6
    assert args.backbone in ['resnet50', 'resnet101', 'swin'], args.backbone
    if args.backbone == 'resnet50':
        pretrain_model = './data/detr_coco/detr-r50-e632da11.pth'
    elif args.backbone == 'resnet101':
        pretrain_model = './data/detr_coco/detr-r101-2c7b67e5.pth'
    else:
        pretrain_model = None
    if pretrain_model is not None:
        pretrain_dict = torch.load(pretrain_model, map_location='cpu')['model']
        my_model_dict = model_without_ddp.state_dict()
        # Keep only the pretrained weights whose names exist in this model.
        pretrain_dict = {k: v for k, v in pretrain_dict.items() if k in my_model_dict}
        my_model_dict.update(pretrain_dict)
        model_without_ddp.load_state_dict(my_model_dict)

    output_dir = Path(args.output_dir)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer,
                                      device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop, every 100 epochs, and every
            # 10 epochs after the LR drop
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            if (epoch + 1) > args.lr_drop and (epoch + 1) % 10 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # IPython.embed()
    # os.system("sudo chmod -R 777 /home/shuxuang/.cache/")
    model, criterion, postprocessors = build_model(args)  # use the same model as detr paper on coco
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" in n and p.requires_grad],
         "lr": args.lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # The dataset was switched from coco to nvdata; the original coco builders
    # and hard-coded nvdata paths are kept below for reference.
    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)
    # home_dir = os.environ["HOME"]
    # dataset_train_ = build_nvdataset(dataset_root=[
    #     os.path.join(os.environ["HOME"], 'datasets/annotation_sql_nvidia'),
    #     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #     mode='train')
    # dataset_val = build_nvdataset(dataset_root=[
    #     os.path.join(os.environ["HOME"], 'datasets/test'),
    #     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #     mode='test')
    # indices_50k = np.load(os.path.join(os.environ["HOME"], 'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))
    dataset_train = build_nvdataset(
        dataset_root=[args.dataset_root_sql, args.dataset_root_img],
        mode='train', camera=args.camera)
    dataset_val = build_nvdataset(
        dataset_root=[args.dataset_root_test, args.dataset_root_test],
        mode='test', camera=args.camera)
    if args.root_indices is not None:
        indices_50k = np.load(os.path.join(args.root_indices))
        dataset_train = Subset(dataset_train, indices_50k)
    # IPython.embed()
    print("Train samples: %d" % (len(dataset_train)))

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    # if args.dataset_file == "coco_panoptic":
    #     # We also evaluate AP during panoptic training, on original coco DS
    #     coco_val = datasets.coco.build("val", args)
    #     base_ds = get_coco_api_from_dataset(coco_val)
    # elif args.dataset_file == "nvdata":
    #     coco_val = datasets.coco.build("val", args)
    #     base_ds = get_coco_api_from_dataset(coco_val)
    # else:
    base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    # if args.eval:
    #     test_stats, coco_evaluator = evaluate_nvdata(model, criterion, postprocessors,
    #                                                  data_loader_val, base_ds, device, args.output_dir)
    #     if args.output_dir:
    #         utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
    #     return
    # if args.eval:
    #     evaluate(model, dataset_val, postprocessors, device)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer,
                                      device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 50 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 50 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        # test_stats, coco_evaluator = evaluate_nvdata(
        #     model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir)
        # log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
        #              **{f'test_{k}': v for k, v in test_stats.items()},
        #              'epoch': epoch,
        #              'n_parameters': n_parameters}
        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
        # for evaluation logs
        # if coco_evaluator is not None:
        #     (output_dir / 'eval').mkdir(exist_ok=True)
        #     if "bbox" in coco_evaluator.coco_eval:
        #         filenames = ['latest.pth']
        #         if epoch % 50 == 0:
        #             filenames.append(f'{epoch:03}.pth')
        #         for name in filenames:
        #             torch.save(coco_evaluator.coco_eval["bbox"].eval,
        #                        output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    wandb.init(project="qpic-project", entity="sangbaeklee", group="experiment_qpic")
    wandb.config = {
        "learning_rate": args.lr,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
    }

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    wandb.watch(model)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" in n and p.requires_grad],
         "lr": args.lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if not args.hoi:
        if args.dataset_file == "coco_panoptic":
            # We also evaluate AP during panoptic training, on original coco DS
            coco_val = datasets.coco.build("val", args)
            base_ds = get_coco_api_from_dataset(coco_val)
        else:
            base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    elif args.pretrained:
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)

    if args.eval:
        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val, args.subject_category_id, device)
            return
        else:
            test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                                  data_loader_val, base_ds, device,
                                                  args.output_dir)
            if args.output_dir:
                utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                     output_dir / "eval.pth")
            return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer,
                                      device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val, args.subject_category_id, device)
            coco_evaluator = None
        else:
            test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                                  data_loader_val, base_ds, device,
                                                  args.output_dir)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        # import pdb; pdb.set_trace()
        if args.dataset_file == 'hico':
            wandb.log({
                "loss": train_stats['loss'],
                "mAP": test_stats['mAP'],
                "mAP rare": test_stats['mAP rare'],
                "mAP non-rare": test_stats['mAP non-rare'],
                "mean max recall": test_stats['mean max recall'],
            })
        elif args.dataset_file == 'vcoco':
            wandb.log({
                "mAP_all": test_stats['mAP_all'],
                "mAP_thesis": test_stats['mAP_thesis'],
                "AP_hold_obj": test_stats['AP_hold_obj'],
                "AP_stand": test_stats['AP_stand'],
                "AP_sit_instr": test_stats['AP_sit_instr'],
                "AP_ride_instr": test_stats['AP_ride_instr'],
                "AP_walk": test_stats['AP_walk'],
                "AP_look_obj": test_stats['AP_look_obj'],
                "AP_hit_instr": test_stats['AP_hit_instr'],
                "AP_hit_obj": test_stats['AP_hit_obj'],
                "AP_eat_obj": test_stats['AP_eat_obj'],
                "AP_eat_instr": test_stats['AP_eat_instr'],
                "AP_jump_instr": test_stats['AP_jump_instr'],
                "AP_lay_instr": test_stats['AP_lay_instr'],
                "AP_talk_on_phone_instr": test_stats['AP_talk_on_phone_instr'],
                "AP_carry_obj": test_stats['AP_carry_obj'],
                "AP_throw_obj": test_stats['AP_throw_obj'],
                "AP_catch_obj": test_stats['AP_catch_obj'],
                "AP_cut_instr": test_stats['AP_cut_instr'],
                "AP_cut_obj": test_stats['AP_cut_obj'],
                "AP_run": test_stats['AP_run'],
                "AP_work_on_computer_instr": test_stats['AP_work_on_computer_instr'],
                "AP_ski_instr": test_stats['AP_ski_instr'],
                "AP_surf_instr": test_stats['AP_surf_instr'],
                "AP_skateboard_instr": test_stats['AP_skateboard_instr'],
                "AP_smile": test_stats['AP_smile'],
                "AP_drink_instr": test_stats['AP_drink_instr'],
                "AP_kick_obj": test_stats['AP_kick_obj'],
                "AP_point_instr": test_stats['AP_point_instr'],
                "AP_read_obj": test_stats['AP_read_obj'],
                "AP_snowboard_instr": test_stats['AP_snowboard_instr'],
                "loss": train_stats['loss'],
            })
        else:
            continue

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def add_exemplar(self, exemplar_info, void_features, void_proposals, image_path,
                 flips, dir_name='pseudo_gts'):
    exemplar_features, exemplar_labels, exemplar_length = exemplar_info
    p = image_path[0].split('/')[-1].split('.')[0]
    templete = image_path[0].replace(p, '{:012}')
    if self.step % 100 == 0:
        # sync multi-gpus
        self.sync_pseudo_gt(templete, dir_name)
    if len(exemplar_features) == 0 or len(void_features) == 0:
        if utils.get_rank() == 0:
            storage = get_event_storage()
            storage.put_scalar("exemplar/add_exemplar", 0)
        return None

    boxes = [x.proposal_boxes.tensor for x in void_proposals]
    l = [len(b) for b in boxes]
    sizes = [x._image_size for x in void_proposals]
    cos = get_cos_sim(void_features, exemplar_features)
    th = max(0.01, self.cos_thresh - (0.01 * self.n_pseudo_gt / 200))
    if float(cos.max()) < 1 - th:
        if utils.get_rank() == 0:
            storage = get_event_storage()
            storage.put_scalar("exemplar/add_exemplar", 0)
        return None

    cos = cos.split(l)
    data = []
    cos_log = []
    label_log = []
    new_label = [-torch.ones((len(x),), device=cos[0].device)
                 for x in void_proposals]
    for i, (c, bbox, p, s) in enumerate(zip(cos, boxes, image_path, sizes)):
        H, W = s
        area = (bbox[:, 2] - bbox[:, 0]) * (bbox[:, 3] - bbox[:, 1])
        ind = size_condition(area, self.size_opt)
        bbox = bbox[ind]
        nonzero_ind = ind.nonzero()
        if len(bbox) == 0:
            continue
        c = c[ind]
        score, ind = c.view(len(bbox), -1).max(dim=0)
        bbox = bbox[ind]
        cc = score
        labels = exemplar_labels
        nonzero_ind = nonzero_ind[ind]

        # keep only pairs whose cosine similarity clears the threshold
        ind = cc > 1 - th
        cc = cc[ind]
        bbox = bbox[ind]
        nonzero_ind = nonzero_ind[ind]

        keep = nms(bbox, cc, self.nms_thresh)
        bbox = bbox[keep]
        cc = cc[keep]
        l = labels[keep]
        nonzero_ind = nonzero_ind[keep]

        bbox = bbox.div(torch.as_tensor([[W, H, W, H]], device=bbox.device))
        if flips[i] == 1:
            # undo the horizontal flip before storing the box
            bbox[:, 0] = 1 - bbox[:, 0]
            bbox[:, 2] = 1 - bbox[:, 2]
            bbox = torch.index_select(
                bbox, -1, torch.as_tensor([2, 1, 0, 3], device=bbox.device))
        labels = l.view(-1, 1).float()
        new_label[i][nonzero_ind] = labels
        path = int(p.split('/')[-1].split('.')[0])
        pa = torch.ones((len(bbox), 1), device=bbox.device) * path
        datum = torch.cat((pa, labels, bbox), dim=-1)
        data.append(datum)
        cos_log.append(cc)
        label_log.append(l)

    if len(data) > 0:
        dir_name = os.path.join(self.output_dir, dir_name)
        data = torch.cat(data)
        self.pseudo_gt = torch.cat((self.pseudo_gt, data))
        if utils.get_rank() == 0:
            storage = get_event_storage()
            storage.put_scalar("exemplar/add_exemplar", len(data))
    return new_label
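# A plausible sketch of the `get_cos_sim` helper used above (and in `clustering`
# below): pairwise cosine similarity between two row-wise feature matrices. The
# repo's actual helper may normalize or reshape differently.
import torch
import torch.nn.functional as F

def get_cos_sim(a, b):
    # a: (N, D), b: (M, D) -> (N, M) matrix of cosine similarities
    a = F.normalize(a, dim=-1)
    b = F.normalize(b, dim=-1)
    return a @ b.t()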
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)
    # IPython.embed()

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # IPython.embed()
    os.system("sudo chmod -R 777 /home/shuxuang/.cache/")
    model, criterion, postprocessors = build_model(args)  # use the same model as detr paper on coco
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_val = build_nvdataset(
        dataset_root=[
            os.path.join(os.environ["HOME"], 'datasets/detection-f'),  # test
            os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')
        ],
        mode='test', camera=args.camera)
    # dataset_val = build_nvdataset(dataset_root=[args.dataset_root_test, args.dataset_root_sql],
    #                               mode='test', camera=args.camera)
    print("Validation samples: %d" % (len(dataset_val)))
    IPython.embed()

    # compute how many boxes in the test dataset for each image
    # accumulate_bboxes_numbers(dataset_val)
    # dataset_train_ = build_nvdataset(dataset_root=[
    #     os.path.join(os.environ["HOME"], 'datasets/annotation_sql_nvidia'),
    #     os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')],
    #     mode='train', camera=args.camera)
    # indices_50k = np.load(os.path.join(os.environ["HOME"], 'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy'))
    # dataset_train = Subset(dataset_train_, indices_50k)
    # print("Train samples: %d" % (len(dataset_train_)))
    # print(len(dataset_val))

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    # args.resume = os.path.join(os.environ["HOME"], 'datasets/exps_detr_base/checkpoint0299.pth')
    # args.resume = '/home/shuxuang/datasets/exps_detr_base/checkpoint0299.pth'
    log_path = args.resume
    log = os.path.join(args.resume, 'log.txt')
    # read_log(log)
    # IPython.embed()
    args.resume = os.path.join(args.resume, 'checkpoint.pth')
    print(args.resume)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            print('Loading model: %s' % args.resume)
            checkpoint = torch.load(args.resume, map_location='cpu')
            print('Load model from %d epoch' % checkpoint['epoch'])
        model_without_ddp.load_state_dict(checkpoint['model'])

    if args.eval:
        vis_bboxes(model, dataset_val, postprocessors, device)
        # inference_time(model, dataset_val, postprocessors, device)
    return model, dataset_val, postprocessors, device
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    # align with DETR format
    args.dataset_file = 'ImageNet'
    args.masks = None
    # freeze cnn weights
    args.lr_backbone = 0 if args.fre_cnn else args.lr
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" in n and p.requires_grad],
         "lr": args.lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.updetr_collate_fn,
                                   num_workers=args.num_workers)
    print(len(data_loader_train) * args.epochs)

    output_dir = Path(args.output_dir)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            if lr_scheduler.step_size != args.lr_drop:
                lr_scheduler.step_size = args.lr_drop
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer,
                                      device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 20 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 20 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args, exp_cfg):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    print(args)

    # device = torch.device('cuda')
    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # model, criterion, postprocessors = build_model(args)
    model = SMPLXNet(exp_cfg)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)
    # for n, p in model_without_ddp.named_parameters():
    #     print(n)

    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)
    print('start build dataset')
    datasets = make_all_datasets(exp_cfg, split='train')
    # dataset_train = ConcatDataset(datasets['body'])
    dataset_train = ConcatDataset(datasets['body'] + datasets['hand'] + datasets['head'])
    print('finish build dataset')

    sample_weight = [child_dataset.sample_weight
                     for child_dataset in dataset_train.datasets]
    sample_weight = np.concatenate(sample_weight, axis=0)
    sampler_train = torch.utils.data.sampler.WeightedRandomSampler(
        sample_weight, len(dataset_train))
    # sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    if args.distributed:
        sampler_train = samplers.DistributedSampler(sampler_train)
        # sampler_val = samplers.DistributedSampler(sampler_val, shuffle=False)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    collate_fn = functools.partial(collate_batch,
                                   use_shared_memory=args.num_workers > 0,
                                   return_full_imgs=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=collate_fn,
                                   num_workers=args.num_workers, pin_memory=True)
    # data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
    #                              drop_last=False, collate_fn=utils.collate_fn,
    #                              num_workers=args.num_workers, pin_memory=True)

    optim_cfg = exp_cfg.get('optim', {})
    optimizer = build_optimizer(model, optim_cfg)
    lr_scheduler = build_scheduler(optimizer, optim_cfg['scheduler'])

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.pretrain:
        checkpoint = torch.load(args.pretrain, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [k for k in unexpected_keys
                           if not (k.endswith('total_params') or k.endswith('total_ops'))]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [k for k in unexpected_keys
                           if not (k.endswith('total_params') or k.endswith('total_ops'))]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            # restore the LRs of the current run's param groups after loading
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    output_dir = Path(args.output_dir)
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, data_loader_train, optimizer, device, epoch)
        # print('DEBUG!!!!!!!!!'); train_stats = {}
        lr_scheduler.step()
        if args.output_dir:
            if not os.path.exists(args.output_dir) and utils.is_main_process():
                os.makedirs(args.output_dir)
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint every args.save_freq epochs
            if (epoch + 1) % args.save_freq == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    image_set = 'fewshot' if args.fewshot_finetune else 'train'
    dataset_train = build_dataset(image_set=image_set, args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_support = build_support_dataset(image_set=image_set, args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
            sampler_support = samplers.NodeDistributedSampler(dataset_support)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
            sampler_support = samplers.DistributedSampler(dataset_support)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_support = torch.utils.data.RandomSampler(dataset_support)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=False)
    loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                              collate_fn=utils.collate_fn,
                              num_workers=args.num_workers, pin_memory=True)
    loader_val = DataLoader(dataset_val, batch_size=args.batch_size, sampler=sampler_val,
                            drop_last=False, collate_fn=utils.collate_fn,
                            num_workers=args.num_workers, pin_memory=True)
    loader_support = DataLoader(dataset_support, batch_size=1, sampler=sampler_support,
                                drop_last=False, num_workers=args.num_workers,
                                pin_memory=False)

    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    if not args.fewshot_finetune:
        param_dicts = [
            {
                "params": [p for n, p in model_without_ddp.named_parameters()
                           if not match_name_keywords(n, args.lr_backbone_names)
                           and not match_name_keywords(n, args.lr_linear_proj_names)
                           and p.requires_grad],
                "lr": args.lr,
                "initial_lr": args.lr,
            },
            {
                "params": [p for n, p in model_without_ddp.named_parameters()
                           if match_name_keywords(n, args.lr_backbone_names)
                           and p.requires_grad],
                "lr": args.lr_backbone,
                "initial_lr": args.lr_backbone,
            },
            {
                "params": [p for n, p in model_without_ddp.named_parameters()
                           if match_name_keywords(n, args.lr_linear_proj_names)
                           and p.requires_grad],
                "lr": args.lr * args.lr_linear_proj_mult,
                "initial_lr": args.lr * args.lr_linear_proj_mult,
            },
        ]
    else:
        # For few-shot finetune stage, do not train sampling offsets,
        # reference points, and embedding related parameters
        param_dicts = [
            {
                "params": [p for n, p in model_without_ddp.named_parameters()
                           if not match_name_keywords(n, args.lr_backbone_names)
                           and not match_name_keywords(n, args.lr_linear_proj_names)
                           and not match_name_keywords(n, args.embedding_related_names)
                           and p.requires_grad],
                "lr": args.lr,
                "initial_lr": args.lr,
            },
            {
                "params": [p for n, p in model_without_ddp.named_parameters()
                           if match_name_keywords(n, args.lr_backbone_names)
                           and p.requires_grad],
                "lr": args.lr_backbone,
                "initial_lr": args.lr_backbone,
            },
        ]

    optimizer = torch.optim.AdamW(param_dicts, weight_decay=args.weight_decay)
    lr_scheduler = WarmupMultiStepLR(optimizer, args.lr_drop_milestones, gamma=0.1,
                                     warmup_epochs=args.warmup_epochs,
                                     warmup_factor=args.warmup_factor,
                                     warmup_method='linear',
                                     last_epoch=args.start_epoch - 1)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.dataset.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [k for k in unexpected_keys
                           if not (k.endswith('total_params') or k.endswith('total_ops'))]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))

        if args.fewshot_finetune:
            if args.category_codes_cls_loss:
                # Re-init weights of novel categories for few-shot finetune
                novel_class_ids = datasets.get_class_ids(args.dataset_file, type='novel')
                if args.num_feature_levels == 1:
                    for novel_class_id in novel_class_ids:
                        nn.init.normal_(
                            model_without_ddp.category_codes_cls.L.weight[novel_class_id])
                elif args.num_feature_levels > 1:
                    for classifier in model_without_ddp.category_codes_cls:
                        for novel_class_id in novel_class_ids:
                            nn.init.normal_(classifier.L.weight[novel_class_id])
                else:
                    raise RuntimeError

    if args.eval:
        # Evaluate only base categories
        test_stats, coco_evaluator = evaluate(args, model, criterion, postprocessors,
                                              loader_val, loader_support, base_ds,
                                              device, type='base')
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval_base.pth")
        # Evaluate only novel categories
        test_stats, coco_evaluator = evaluate(args, model, criterion, postprocessors,
                                              loader_val, loader_support, base_ds,
                                              device, type='novel')
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval_novel.pth")
        return

    print("Start training...")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(args, model, criterion, loader_train, optimizer,
                                      device, epoch, args.clip_max_norm)
        lr_scheduler.step()

        # Saving Checkpoints after each epoch
        if args.output_dir and (not args.fewshot_finetune):
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        # Saving Checkpoints every args.save_every_epoch epoch(s)
        if args.output_dir:
            checkpoint_paths = []
            if (epoch + 1) % args.save_every_epoch == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        # Evaluation and Logging
        if (epoch + 1) % args.eval_every_epoch == 0:
            if 'base' in args.dataset_file:
                evaltype = 'base'
            else:
                evaltype = 'all'
            if args.fewshot_finetune:
                evaltype = 'novel'
            test_stats, coco_evaluator = evaluate(args, model, criterion, postprocessors,
                                                  loader_val, loader_support, base_ds,
                                                  device, type=evaltype)
            log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                         **{f'test_{k}': v for k, v in test_stats.items()},
                         'epoch': epoch,
                         'n_parameters': n_parameters,
                         'evaltype': evaltype}
            if args.output_dir and utils.is_main_process():
                with (output_dir / "results.txt").open("a") as f:
                    f.write(json.dumps(log_stats) + "\n")
                # for evaluation logs
                if coco_evaluator is not None:
                    (output_dir / 'eval').mkdir(exist_ok=True)
                    if "bbox" in coco_evaluator.coco_eval:
                        filenames = ['latest.pth', f'{epoch:03}.pth']
                        for name in filenames:
                            torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                       output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args): print("Starting initial Testing") batch_size = 1 nframes = 20 nframes_val = 2 num_steps = 20 # im_h = 320 # im_w = 320 im_h = 240 im_w = 432 # size = (480, 864) size = (im_h, im_w) def image_read(path): pic = Image.open(path) transform = tv.transforms.Compose([ tv.transforms.Resize(size, interpolation=Image.BILINEAR), tv.transforms.ToTensor(), tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD) ]) return transform(pic) def label_read(path): if os.path.exists(path): pic = Image.open(path) transform = tv.transforms.Compose([ tv.transforms.Resize(size, interpolation=Image.NEAREST), LabelToLongTensor() ]) label = transform(pic) else: label = torch.LongTensor(1, *size).fill_( 255) # Put label that will be ignored return label # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) data_folder = '../data/DAVIS/' val_set = DAVIS17V2(data_folder, '2017', 'val', image_read, label_read, None, nframes_val) print("Start testing") device = torch.device(args.device) model = build(args) model.to(device) # model = torch.nn.parallel.DataParallel(model) output_dir = './model/' + args.name model_file = output_dir + '/epoch_80.pth' # model = torch.nn.parallel.DataParallel(model) # model_dict = model.state_dict() # pretrained_dict = torch.load(model_file) # # from collections import OrderedDict # new_state_dict = OrderedDict() # for k, v in pretrained_dict.items(): # name = k[7:] # remove `module.` ## 多gpu 训练带moudule默认参数名字,预训练删除 # new_state_dict[name] = v # # model_dict.update(new_state_dict) # model.load_state_dict(model_dict) model.load_state_dict(torch.load(model_file)) model.eval() start_time = time.time() seg_total = 0 score_thresh = [0.1, 0.2, 0.3, 0.4, 0.5] J_value = np.zeros(len(score_thresh), dtype=np.float) F_value = np.zeros(len(score_thresh), dtype=np.float) with torch.no_grad(): for seqname, video_parts in val_set.get_video_generator(): video_frames = 0 for video_part in video_parts: images, segannos, sentence, object_id, fnames = read_video_part( video_part, device) # Read data vos_images = images.view([-1, 3, im_h, im_w]) vos_segannos = segannos.float().view([-1, 1, im_h, im_w]).numpy() vos_sentence = sentence.view(-1, num_steps) outputs = model(vos_images, vos_sentence, int(vos_images.shape[0])) outputs = outputs[-1].sigmoid().cpu().detach().numpy() # predicts = outputs[3].softmax(dim=1).max(1)[1].cpu().detach().numpy() # predicts = ((outputs >= 0.5) * 255).astype(np.uint8) # save_file = os.path.join('Mask', seqname) # if not os.path.isdir(save_file): # os.mkdir(save_file) # for i in range(predicts.shape[0]): # img = predicts[i][0] # img = cv2.applyColorMap(img, cv2.COLORMAP_JET) # cv2.imwrite('./' + save_file + '/' + str(object_id) + '_' + str(i + video_frames) + '.jpg', img) for n_score in range(len(score_thresh)): predicts = (outputs >= score_thresh[n_score]).astype( np.float32) num_frames = predicts.shape[0] for n_frame in range(num_frames): mean_IoU = compute_mean_IoU( predicts[n_frame].squeeze(), vos_segannos[n_frame].squeeze()) J_value[n_score] += mean_IoU mean_F = f_measure(predicts[n_frame].squeeze(), vos_segannos[n_frame].squeeze()) F_value[n_score] += mean_F if score_thresh[n_score] == 0.5: print( 'Seqname: {}, object_id: {}, frame: {}, IoU : {}, F : {}' .format(seqname, object_id, video_frames + n_frame + 1, mean_IoU, mean_F)) # num_frames = predicts.shape[0] # for n_frame in range(num_frames): # # mean_IoU = compute_mean_IoU(predicts[n_frame].squeeze(), 
vos_segannos[n_frame].squeeze()) # J_value[0] += mean_IoU # # mean_F = f_measure(predicts[n_frame].squeeze(), vos_segannos[n_frame].squeeze()) # F_value[0] += mean_F # # # print('Seqname: {}, object_id: {}, frame: {}, IoU : {}, F : {}'.format(seqname, object_id, # video_frames + n_frame + 1, # mean_IoU, mean_F)) seg_total += num_frames video_frames += num_frames # msg = 'cumulative IoU = %f' % (cum_I / cum_U) # for n_eval_iou in range(len(eval_seg_iou_list)): # eval iou > .5, .6, .7, .8, .9 # eval_seg_iou = eval_seg_iou_list[n_eval_iou] # seg_correct[n_eval_iou] += (I / U >= eval_seg_iou) # Print results print('Segmentation evaluation') for n_score in range(len(score_thresh)): result_str = '' # for n_eval_iou in range(len(eval_seg_iou_list)): # result_str += 'precision@%s = %f\n' % \ # (str(eval_seg_iou_list[n_eval_iou]), seg_correct[n_eval_iou] / seg_total) result_str += 'threshold %.2f: J = %.4f, F = %.4f' % ( score_thresh[n_score], J_value[n_score] / seg_total, F_value[n_score] / seg_total) print(result_str) # result_str = '' # # for n_eval_iou in range(len(eval_seg_iou_list)): # # result_str += 'precision@%s = %f\n' % \ # # (str(eval_seg_iou_list[n_eval_iou]), seg_correct[n_eval_iou] / seg_total) # result_str += 'J = %.4f, F = %.4f' % (J_value[0] / seg_total, F_value[0] / seg_total) # print(result_str) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Testing time {}'.format(total_time_str))
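# Note: a minimal sketch of the region similarity J accumulated above,
# assuming compute_mean_IoU takes two binary (H, W) mask arrays. The DAVIS F
# score is a boundary metric and is omitted here; this helper is hypothetical,
# not the repo's implementation.
import numpy as np

def binary_mask_iou(pred, gt, eps=1e-6):
    """Intersection-over-union of two binary masks (the per-frame J measure)."""
    pred = pred.astype(bool)
    gt = gt.astype(bool)
    inter = np.logical_and(pred, gt).sum()
    union = np.logical_or(pred, gt).sum()
    return float(inter) / (float(union) + eps)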
def sync_pseudo_gt(self, templete=None, dir_name='pseudo_gts'):
    size = utils.get_world_size()
    if self.pseudo_gt is None or size == 1:
        return
    try:
        # dump crops of the newly added pseudo ground-truth boxes for inspection
        data = self.pseudo_gt[self.n_pseudo_gt:].view(-1, 6)
        path = data[:, 0].long()
        label = data[:, 1]
        boxes = data[:, 2:]
        dir_name = os.path.join(self.output_dir, dir_name)
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        for p in path.unique():
            img = cv2.imread(templete.format(p))
            img_h, img_w, _ = img.shape
            multiplier = torch.tensor([img_w, img_h, img_w, img_h],
                                      dtype=torch.float32, device=data.device)
            idx = path == p
            bbox = boxes[idx]
            bbox = bbox * multiplier
            bbox = bbox.int().cpu().numpy()
            lbl = label[idx]
            if not os.path.exists(dir_name + '/{:05}'.format(self.step)):
                os.mkdir(dir_name + '/{:05}'.format(self.step))
            for i, box in enumerate(bbox):
                cropped_image = img[box[1]:box[3] + 1, box[0]:box[2] + 1]
                framed_image = cropped_image.astype(np.uint8)
                cropped_img_path = os.path.join(
                    dir_name, '{:05}/{:03}_{:012}_{:03}.jpg'.format(
                        self.step, int(lbl[i]), int(p), i))
                out = cv2.imwrite(cropped_img_path, framed_image)
                if not out:
                    print("FAIL TO SAVE")
    except Exception as e:  # crop dumping is best-effort; a bare except hid the reason
        print("FAIL TO SAVE: {}".format(e))

    # gather the per-rank pseudo ground truth so every process sees all of it
    rank = utils.get_rank()
    array = torch.zeros((size, 1), device=self.pseudo_gt.device)
    array[rank] = len(self.pseudo_gt) - self.n_pseudo_gt
    dist.all_reduce(array, dist.ReduceOp.SUM)
    data = self.pseudo_gt[self.n_pseudo_gt:]
    max_size = int(array.max())
    # pad every rank's tensor to the same length before all_gather
    data = torch.cat((data, torch.zeros((max_size - len(data), data.shape[1]),
                                        device=data.device)))
    input_list = [
        torch.empty(size=(max_size, 6), device=self.pseudo_gt.device)
        for _ in array
    ]
    dist.all_gather(input_list, data)
    input_list = [e[:int(array[i])] for i, e in enumerate(input_list)]
    data = torch.cat(input_list)
    print("{} data sync".format(len(data)))
    self.pseudo_gt = torch.cat((self.pseudo_gt, data))
    self.n_pseudo_gt = len(self.pseudo_gt)
    if utils.get_rank() == 0:
        print(array)
        torch.save(
            self.pseudo_gt.cpu(),
            os.path.join(self.output_dir,
                         'pseudo_gts/{}.pth'.format(self.step)))
def clustering(self, image_path=None): # sync data self.sync_pseudo_gt() feature = self.gather(self.feature_memory) obj_score = self.gather(self.obj_score_memory) paths = self.gather(self.path_memory) bbox = self.gather(self.bbox_memory) self.feature_memory = [] self.obj_score_memory = [] self.path_memory = [] self.bbox_memory = [] if utils.get_rank() == 0 and self.cls_weight.weight.sum() < len( self.cls_weight.weight): ids, centroid, var = clustering(feature, K=self.num_centroid, step=self.step, device=feature.device, tol=1e-3, Niter=150) count = torch.bincount(ids) mean_obj_score = torch.bincount( ids, weights=obj_score.to(ids.device)) / (count + 1e-6) # top 10 % dense clusters. dist_topk_bound = -torch.topk( -var.view(-1), k=min(len(mean_obj_score), 13)).values[-1] mask = var < dist_topk_bound # number of found unknown classes cls_weight = sum(self.cls_weight.weight) - self.num_classes # high objectness clusters. cluster_obj_thresh = min( self.cluster_obj_thresh * (1 + cls_weight / len(self.cls_weight.weight)), 0.99) obj_mask = mean_obj_score.to(mask.device) > cluster_obj_thresh mask = torch.logical_and(mask, obj_mask.to(mask.device)) mask = mask.bool().view(-1) ids = ids.long().view(-1) paths = paths[mask[ids]] bbox = bbox[mask[ids]] feature = feature[mask[ids]] obj_score = obj_score[mask[ids]] ids = ids[mask[ids]] centroid = centroid[mask] if len(obj_score) > 0: obj_thresh = min(self.coupled_obj_thresh, max(obj_score)) else: obj_thresh = self.coupled_obj_thresh obj_thresh = obj_thresh + (self.n_pseudo_gt * 0.01 / 100) obj_thresh = min(obj_thresh, 0.99) idx = obj_score >= obj_thresh bbox = bbox[idx] feature = feature[idx] paths = paths[idx] obj_score = obj_score[idx] ids = ids[idx] feats = [] boxes = [] ps = [] obj_scores = [] new_ids = [] cls_weight = sum(self.cls_weight.weight) - self.num_classes coupled_cos_thresh = self.coupled_cos_thresh * ( 1 - cls_weight / len(self.cls_weight.weight)) coupled_cos_thresh = max(coupled_cos_thresh, 0.01) for i, l in enumerate(sorted(ids.unique())): idx = ids == l feat = feature[idx] bb = bbox[idx] path = paths[idx] obj = obj_score[idx] cos_sim = get_cos_sim(feat, feat).view(-1) cos_dist = 1 - cos_sim idx = cos_dist.argsort() used = [] used_path = [] printer = cos_sim[idx] printer = printer[printer < 0.99999] # eliminate same element pairs for v in idx: x, y = v // len(feat), v % len(feat) if cos_dist[v] > coupled_cos_thresh: break if path[x] != path[y] and path[ x] not in used_path and path[y] not in used_path: used.append(x) used.append(y) used_path.append(path[x]) used_path.append(path[y]) if len(used) > 0: idx = torch.as_tensor(used, device=feat.device) temp_ids = torch.ones( (len(used), ), device=feat.device) * l feats.append(feat[idx]) boxes.append(bb[idx]) ps.append(path[idx]) obj_scores.append(obj[idx]) new_ids.append(temp_ids) if len(feats) > 0: feature = torch.cat(feats) bbox = torch.cat(boxes) paths = torch.cat(ps) obj_score = torch.cat(obj_scores) ids = torch.cat(new_ids) cls_weight = self.cls_weight.weight start_l = int(cls_weight.sum() ) + self.original_num_classes - self.num_classes labels = -ids - 1 unique_label = labels.unique() unique_label = unique_label[:cls_weight.shape[0] - int(cls_weight.sum())] for i, p in enumerate(unique_label): if i + start_l - self.original_num_classes == self.num_centroid: break labels[labels == p] = i + start_l idx = labels > 0 obj_score = obj_score[idx] labels = labels[idx] paths = paths[idx] feature = feature[idx] bbox = bbox[idx] data = torch.cat( (paths.unsqueeze(1), labels.unsqueeze(1).float(), 
bbox), dim=-1) else: data = torch.zeros((0, 6), device=feature.device) if image_path is not None and len(data) > 0: utils.save_boxes(data, feature.detach(), obj_score.detach(), image_path, self.pal, self.step, self.num_classes, self.output_dir) size = torch.as_tensor([len(data), len(centroid)], device=feature.device).float() storage = get_event_storage() storage.put_scalar("exemplar/obj_th", float(obj_thresh)) storage.put_scalar("exemplar/cluster_obj_th", float(cluster_obj_thresh)) storage.put_scalar("exemplar/sel_cluster", int(mask.sum())) storage.put_scalar("exemplar/coupled_cos_th", float(coupled_cos_thresh)) storage.put_scalar("exemplar/new", len(data)) else: size = torch.empty(size=(1, 2), device=feature.device) # gather if utils.get_world_size() > 1: torch.cuda.synchronize() dist.broadcast(size, 0) if utils.get_rank() > 0: data = torch.empty(size=(int(size[0, 0]), 6), device=feature.device) torch.cuda.synchronize() dist.broadcast(data, 0) l_cls = self.original_num_classes - 1 l_new = int(data[:, 1].max() - l_cls) if len(data) > 0 else 0 cls_weight = self.cls_weight.weight.data cls_weight[:self.num_classes + l_new] = 1 self.cls_weight.weight.data = cls_weight if self.pseudo_gt is None: self.pseudo_gt = data else: self.pseudo_gt = torch.cat((self.pseudo_gt, data)) self.n_pseudo_gt = len(self.pseudo_gt) # flush if utils.get_rank() == 0: try: torch.save( self.pseudo_gt.cpu(), os.path.join(self.output_dir, 'pseudo_gts/{}.pth'.format(self.step))) except: pass
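# Note: get_cos_sim above is assumed to return the pairwise cosine-similarity
# matrix of a feature batch, which the clustering step flattens and sorts to
# pair up mutually similar exemplars. A minimal sketch of such a helper:
import torch
import torch.nn.functional as F

def pairwise_cos_sim(feat):
    """(N, D) features -> (N, N) matrix of cosine similarities."""
    feat = F.normalize(feat, dim=-1)  # unit-norm rows
    return feat @ feat.t()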
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) dataset_train = build_dataset(split='train', args=args) dataset_val = build_dataset(split='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) elif args.dataset_file == "coco": base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: checkpoint = torch.load(args.frozen_weights, map_location='cpu') model_without_ddp.detr.load_state_dict(checkpoint['model']) output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 # if args.eval: # if 'coco' in args.dataset_file: # test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, # data_loader_val, base_ds, device, args.output_dir) # if args.output_dir: # utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") # elif 'anet' == args.dataset_file: # evaluate3d(model, postprocessors, data_loader_val, device, epoch=0) # return print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, 
device, epoch, args.clip_max_norm) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) if epoch % args.eval_freq == 0: if 'coco' in args.dataset_file: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) elif 'anet' == args.dataset_file: evaluate3d(model, postprocessors, data_loader_val, device, epoch)
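# Note: the param_dicts pattern used by the mains above trains the backbone
# with a smaller learning rate than the rest of the model, and StepLR then
# decays every group together. A self-contained sketch with a toy module
# (all hyperparameter values below are placeholders):
import torch

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.Linear(4, 2))
backbone_params = list(model[0].parameters())  # stand-in for "backbone" params
head_params = list(model[1].parameters())
optimizer = torch.optim.AdamW(
    [{"params": head_params},                   # uses the default lr below
     {"params": backbone_params, "lr": 1e-5}],  # lower backbone lr
    lr=1e-4, weight_decay=1e-4)
# StepLR multiplies every group's lr by gamma once per step_size epochs, which
# is why the loops above checkpoint around (epoch + 1) % args.lr_drop == 0.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)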
def losses(self, predictions, proposals, void_predictions, void_proposals, image_path=None, flips=None, use_exemplar=False): """ Args: predictions: return values of :meth:`forward()`. proposals (list[Instances]): proposals that match the features that were used to compute predictions. """ if utils.get_rank() == 0: storage = get_event_storage() storage.put_scalar( "exemplar/num_pseudo_gt", len(self.pseudo_gt) if self.pseudo_gt is not None else 0) scores, proposal_deltas, feature = predictions void_scores, _, void_feature = void_predictions if len(void_scores) > 0: neg_sample = void_scores storage = get_event_storage() storage.put_scalar("exemplar/num_neg_sample", len(neg_sample)) void_neg_loss = -torch.log( 1 - neg_sample.softmax(-1)[:, :self.num_classes - 1] + 1e-8) if len(void_neg_loss) > 0: void_neg_loss = void_neg_loss.sum() / len(void_neg_loss) else: void_neg_loss = void_neg_loss.sum() else: void_neg_loss = scores.sum() * 0 void_loss = {'loss_void_neg': void_neg_loss} if use_exemplar: a, b, c = void_predictions l = sum([len(x) for x in void_proposals[:-1]]) self.add_feature(predictions, proposals, (a[:l], b[:l], c[:l]), void_proposals[:-1], image_path[:-1], flips[:-1]) else: self.add_feature(predictions, proposals, void_predictions, void_proposals, image_path, flips) frcnn_outputs = FastRCNNOutputs( self.box2box_transform, scores, proposal_deltas, proposals, self.smooth_l1_beta, self.box_reg_loss_type, self.box_reg_loss_weight, self.label_converter, add_unlabeled_class=self.add_unlabeled_class, cls_weight=self.cls_weight.weight.view(-1), bg_class_ind=self.num_classes - 1) losses = frcnn_outputs.losses() self.step += 1 losses.update(void_loss) return losses
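# Note: the void negative loss above penalizes confident known-class scores on
# void proposals via -log(1 - softmax(s)[known classes]). A toy illustration
# with made-up shapes (num_classes counts the background slot, as in the code):
import torch

num_classes = 4
scores = torch.randn(3, num_classes + 2)  # e.g. two extra unknown-class logits
p_known = scores.softmax(-1)[:, :num_classes - 1]
loss = -torch.log(1 - p_known + 1e-8)
loss = loss.sum() / len(loss)  # summed over classes, averaged over proposals
print(loss)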
def main(args): utils.init_distributed_mode(args) if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) dataset_train = build_dataset(image_set='train', args=args) dataset_val = build_dataset(image_set='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, batch_size=1, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) else: base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: io.load_frozen(args, model_without_ddp) output_dir = Path(args.output_dir) if args.resume: io.resume(args, model_without_ddp, optimizer, lr_scheduler) elif args.finetune: io.finetune(args, model_without_ddp) if args.eval: if args.output_dir and utils.is_main_process(): io.init_wandb(args.dataset_file + "-detr-eval", model, args, n_parameters=n_parameters) test_stats, evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) if args.output_dir: io.save_on_master(evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return print("Start training") start_time = time.time() if args.output_dir and utils.is_main_process(): io.init_wandb(args.dataset_file + "-detr", model, args, n_parameters=n_parameters) for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm) lr_scheduler.step() if args.output_dir: # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: io.save_checkpoint(args, model_without_ddp, optimizer, lr_scheduler, epoch) test_stats, evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir, epoch) if utils.is_main_process() and args.output_dir: 
io.log_wandb(train_stats, test_stats) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) # save final model if utils.is_main_process() and args.output_dir: io.save_on_master(model_without_ddp, output_dir / "model_final.pth") print('Training time {}'.format(total_time_str))
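# Note: save_on_master / is_main_process above gate filesystem writes to rank 0
# so that N distributed processes do not race on the same checkpoint file. A
# typical implementation, sketched after DETR's utilities (assuming
# torch.distributed may or may not be initialized):
import torch
import torch.distributed as dist

def is_main_process():
    if not (dist.is_available() and dist.is_initialized()):
        return True  # single-process runs are always "main"
    return dist.get_rank() == 0

def save_on_master(*args, **kwargs):
    if is_main_process():
        torch.save(*args, **kwargs)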
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    Dataset = get_dataset(args.dataset, args.task)
    f = open(args.data_cfg)
    data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    f.close()

    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    scales = [480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
    transforms = T.Compose([
        T.RandomHorizontalFlip(),
        T.RandomSelect(
            T.RandomResize(scales, max_size=1333),
            T.Compose([
                T.RandomResize([400, 500, 600]),
                T.RandomSizeCrop(384, 600),
                # T.RandomSizeCrop_MOT(384, 600),
                T.RandomResize(scales, max_size=1333),
            ])),
        normalize,
    ])
    dataset_train = Dataset(args, dataset_root, trainset_paths, (1088, 608),
                            augment=True, transforms=transforms)
    args.nID = dataset_train.nID

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model

    # dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            # sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            # sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        # sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    # data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
    #                              drop_last=False, collate_fn=utils.collate_fn,
    #                              num_workers=args.num_workers, pin_memory=True)
    # data_loader_train = torch.utils.data.DataLoader(
    #     dataset_train,
    #     batch_size=args.batch_size,
    #     shuffle=True,
    #     num_workers=args.num_workers,
    #     pin_memory=True,
    #     drop_last=True
    # )

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    # to keep the classifier parameters frozen (not updated):
    # for name, p in model_without_ddp.named_parameters():
    #     if name.startswith('classifier'):
    #         p.requires_grad = False

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr": args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
        ],
        "lr": args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad
        ],
        "lr": args.lr * args.lr_linear_proj_mult,
    }]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
    # optimizer.add_param_group({'params': criterion.parameters()})

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_dict = model_without_ddp.state_dict()  # parameters of the current model
        # skip the classification heads, whose shape depends on the class count
        skip_keys = ['class_embed.{}.{}'.format(i, part)
                     for i in range(6) for part in ('weight', 'bias')]
        pretrained_dict = {
            k: v for k, v in checkpoint['model'].items() if k not in skip_keys
        }
        model_dict.update(pretrained_dict)
        # missing_keys, unexpected_keys = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            model_dict, strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            args.start_epoch = checkpoint['epoch'] + 1
            # optimizer.load_state_dict(checkpoint['optimizer'])
        # if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
        #     import copy
        #     p_groups = copy.deepcopy(optimizer.param_groups)
        #     # optimizer.load_state_dict(checkpoint['optimizer'])
        #     for pg, pg_old in zip(optimizer.param_groups, p_groups):
        #         pg['lr'] = pg_old['lr']
        #         pg['initial_lr'] = pg_old['initial_lr']
        #     # print(optimizer.param_groups)
        #     lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        #     # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
        #     args.override_resumed_lr_drop = True
        #     if args.override_resumed_lr_drop:
        #         print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.')
        #         lr_scheduler.step_size = args.lr_drop
        #         lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups))
        #     lr_scheduler.step(lr_scheduler.last_epoch)

    # model.add_module('id')
    # [p for p in model.named_parameters() if not p[1].requires_grad]
    # to keep the classifier parameters frozen (not updated):
    # optimizer = torch.optim.SGD(filter(lambda x: "classifier" not in x[0], model.parameters()), lr=args.lr,
    #                             momentum=0.9, weight_decay=1e-4)
    # model.classifier.training = False

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(args, model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
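# Note: the resume path above drops classification heads whose shape depends on
# the number of classes before loading. A generic sketch that filters by
# substring instead of listing every key (hypothetical helper, not the repo's):
def strip_keys(state_dict, patterns=("class_embed",)):
    """Return a copy of state_dict without keys matching any pattern."""
    return {k: v for k, v in state_dict.items()
            if not any(p in k for p in patterns)}

# usage: model.load_state_dict(strip_keys(checkpoint['model']), strict=False)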
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) dataset_train = build_dataset(image_set='train', args=args) dataset_val = build_dataset(image_set='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) else: base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: checkpoint = torch.load(args.frozen_weights, map_location='cpu') model_without_ddp.detr.load_state_dict(checkpoint['model']) output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model'], strict=False) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.eval: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return #cab writer = SummaryWriter("runs/" + args.tb_name) best_value = 0 print("Start training, best_value is " + str(best_value)) start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm) lr_scheduler.step() 
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        #cab
        for k, v in train_stats.items():
            if isinstance(v, float):
                writer.add_scalar(f'train_{k}', v, epoch)

        # the twelve entries of a COCO eval summary, in pycocotools order
        coco_tags = [
            'Average Precision (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]',
            'Average Precision (AP) @[ IoU=0.50      | area=   all | maxDets=100 ]',
            'Average Precision (AP) @[ IoU=0.75      | area=   all | maxDets=100 ]',
            'Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
            'Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
            'Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
            'Average Recall (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ]',
            'Average Recall (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ]',
            'Average Recall (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]',
            'Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]',
            'Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]',
            'Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]',
        ]
        new_value = 0
        for k, v in test_stats.items():
            if isinstance(v, float):
                writer.add_scalar(f'test_{k}', v, epoch)
            if k == "coco_eval_bbox":
                new_value = v[0]
                for tag, value in zip(coco_tags, v):
                    writer.add_scalar('Bbox ' + tag, value, epoch)
            if k == "coco_eval_masks":
                new_value = v[0]
                for tag, value in zip(coco_tags, v):
                    writer.add_scalar('Mask ' + tag, value, epoch)

        print("Epoch finished, best_value is " + str(best_value))
        save_pth = False
        if best_value < new_value:
            best_value = new_value
            save_pth = True

        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            if save_pth:
                checkpoint_paths.append(output_dir / 'best.pth')
                # use a context manager so the best-model log is flushed and closed
                with open(output_dir / 'best_log.txt', 'w+') as bestLog:
                    bestLog.write(f'Saved model at epoch {epoch:04}\n')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
        #/cab

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
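# Note: the eval/*.pth files written above hold coco_eval.eval, the dict that
# pycocotools' COCOeval.accumulate() produces; its 'precision' array is indexed
# [iou_thr, recall, category, area, max_dets]. A sketch of reading AP back
# (file path and indexing conventions assumed from pycocotools):
import torch

ev = torch.load('eval/latest.pth')
precision = ev['precision']            # shape [T, R, K, A, M]
ap = precision[precision > -1].mean()  # -1 marks cells with no ground truth
print('AP averaged over IoUs/areas/categories:', ap)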
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_test = build_dataset(image_set='test', args=args)

    # the test split needs its own sampler in every branch
    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val,
                                                          shuffle=False)
            sampler_test = samplers.NodeDistributedSampler(dataset_test,
                                                           shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
            sampler_test = samplers.DistributedSampler(dataset_test,
                                                       shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train,
                                                        args.batch_size,
                                                        drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers, pin_memory=True)
    # use the dedicated test sampler (sampler_val here would re-walk the val split)
    data_loader_test = DataLoader(dataset_test, args.batch_size,
                                  sampler=sampler_test, drop_last=False,
                                  collate_fn=utils.collate_fn,
                                  num_workers=args.num_workers, pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [{
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if not match_name_keywords(n, args.lr_backbone_names)
            and not match_name_keywords(n, args.lr_linear_proj_names)
            and p.requires_grad
        ],
        "lr": args.lr,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
        ],
        "lr": args.lr_backbone,
    }, {
        "params": [
            p for n, p in model_without_ddp.named_parameters()
            if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad
        ],
        "lr": args.lr * args.lr_linear_proj_mult,
    }]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume,
                                                            map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        # drop every classification head (its shape depends on the class count);
        # heads 0-6 exist for both the decoder copies and the top-level embeds
        for i in range(7):
            for part in ('weight', 'bias'):
                del checkpoint["model"][f"transformer.decoder.class_embed.{i}.{part}"]
                del checkpoint["model"][f"class_embed.{i}.{part}"]
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        # if len(missing_keys) > 0:
        #     print('Missing Keys: {}'.format(missing_keys))
        # if len(unexpected_keys) > 0:
        #     print('Unexpected Keys: {}'.format(unexpected_keys))
        # if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
        #     import copy
        #     p_groups = copy.deepcopy(optimizer.param_groups)
        #     optimizer.load_state_dict(checkpoint['optimizer'])
        #     for pg, pg_old in zip(optimizer.param_groups, p_groups):
        #         pg['lr'] = pg_old['lr']
        #         pg['initial_lr'] = pg_old['initial_lr']
        #     # print(optimizer.param_groups)
        #     lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        #     # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance).
# args.override_resumed_lr_drop = True # if args.override_resumed_lr_drop: # print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.') # lr_scheduler.step_size = args.lr_drop # lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups)) # lr_scheduler.step(lr_scheduler.last_epoch) # args.start_epoch = checkpoint['epoch'] + 1 # # check the resumed model if not args.eval: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) if args.eval: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return if args.test: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_test, base_ds, device, args.output_dir) if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] # extra checkpoint before LR drop and every 5 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") # for evaluation logs if coco_evaluator is not None: (output_dir / 'eval').mkdir(exist_ok=True) if "bbox" in coco_evaluator.coco_eval: filenames = ['latest.pth'] if epoch % 50 == 0: filenames.append(f'{epoch:03}.pth') for name in filenames: torch.save(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / name) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n  {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]

    dataset_val = build_dataset(image_set='val', args=args)
    sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)
    base_ds = get_coco_api_from_dataset(dataset_val)

    checkpoint = torch.load(args.resume)
    model_without_ddp.load_state_dict(checkpoint['model'])
    model_without_ddp.eval()
    model_without_ddp.to(device)

    header = 'Test:'
    for samples, targets in data_loader_val:
        samples = samples.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        outputs = model_without_ddp(samples)
        probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
        keep = probas.max(-1).values > 0.7
        # rescale_bboxes needs the original image size as its second argument;
        # assuming DETR-style targets that carry 'orig_size' as (h, w)
        img_h, img_w = targets[0]['orig_size'].tolist()
        bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep],
                                       (img_w, img_h))
        print(probas[keep], bboxes_scaled)
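# Note: rescale_bboxes above follows DETR's demo helpers, which convert the
# normalized (cx, cy, w, h) predictions into absolute (x0, y0, x1, y1) corners
# for a given image size; a sketch reproduced from that convention:
import torch

def box_cxcywh_to_xyxy(x):
    x_c, y_c, w, h = x.unbind(-1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)

def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    # scale normalized corners up to pixel coordinates
    return b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)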
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) # IPython.embed() # IPython.embed() os.system("sudo chmod -R 777 /home/shuxuang/.cache/") model, criterion, postprocessors = build_model( args) # use the same model as detr paper on coco model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) # optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, # weight_decay=args.weight_decay) # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) # dataset_train = build_dataset(image_set='train', args=args) # dataset_val = build_dataset(image_set='val', args=args) # modify the dataset from coco to nvdata # home_dir = os.environ["HOME"] # # on local # dataset_train_ = build_nvdataset(dataset_root=[ # os.path.join(os.environ["HOME"],'datasets/annotation_sql_nvidia'), # os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')], # mode='train') # dataset_val = build_nvdataset(dataset_root=[ # os.path.join(os.environ["HOME"],'datasets/test'), # os.path.join(os.environ["HOME"], 'datasets/frames_nvidia')], # mode='test', camera=args.camera) dataset_val = build_nvdataset(dataset_root=[ os.path.join(os.environ["HOME"], 'datasets/test'), os.path.join(os.environ["HOME"], 'datasets/test') ], mode='test', camera=args.camera) # indices_50k =np.load(os.path.join(os.environ["HOME"],'datasets/id_1_criterion_Max_SSD_num_labels_50000.npy')) # # on maglev # dataset_train_ = build_nvdataset(dataset_root=[args.dataset_root_sql, args.dataset_root_img], # mode='train') # dataset_val = build_nvdataset(dataset_root=[args.dataset_root_test, args.dataset_root_sql], # mode='test', camera=args.camera) # indices_50k =np.load(os.path.join(args.root_indices)) # dataset_train = Subset(dataset_train_, indices_50k) print("Validation samples: %d" % (len(dataset_val))) # IPython.embed() if args.frozen_weights is not None: checkpoint = torch.load(args.frozen_weights, map_location='cpu') model_without_ddp.detr.load_state_dict(checkpoint['model']) output_dir = Path(args.output_dir) # args.resume = os.path.join(os.environ["HOME"], 'datasets/exps_detr_base/checkpoint0299.pth') # args.resume = '/home/shuxuang/datasets/exps_detr_base/checkpoint0299.pth' print(args.resume) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: print('Loading model: %s' % args.resume) checkpoint = torch.load(args.resume, map_location='cpu') print('Load model from %d epoch' % (checkpoint['epoch'] + 1)) model_without_ddp.load_state_dict(checkpoint['model']) if args.eval: if args.dataset_file == 'nvdata': evaluate(model, dataset_val, postprocessors, device) else: evaluate_5classes(model, dataset_val, postprocessors, device) return model, dataset_val, postprocessors, device
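# Note: every entry point above seeds with args.seed + utils.get_rank(). The
# rank offset gives each distributed process its own RNG stream, so random
# augmentation differs across GPUs while the run as a whole stays reproducible.
# A minimal sketch of the same pattern (hypothetical helper name):
import random

import numpy as np
import torch

def seed_everything(base_seed, rank=0):
    seed = base_seed + rank  # distinct but deterministic stream per process
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)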
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print("number of params:", n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) dataset_train = build_dataset(image_set="train", args=args) dataset_val = build_dataset(image_set="val", args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader( dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers, ) data_loader_val = DataLoader( dataset_val, args.batch_size if args.batch_size < 4 else 4, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers, ) if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) elif args.dataset_file in ["cmdd", "cmdc", "wider"]: base_ds = None elif args.dataset_file == "MOT17": base_ds = dataset_val else: base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: checkpoint = torch.load(args.frozen_weights, map_location="cpu") model_without_ddp.detr.load_state_dict(checkpoint["model"]) output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith("https"): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location="cpu", check_hash=True) else: checkpoint = torch.load(args.resume, map_location="cpu") # NOTE: this is Bruno's hack to load stuff in model_dict = model_without_ddp.state_dict() pretrained_dict = checkpoint["model"] # hack for adding query stuff if ("query_embed.query_embed.weight" in model_dict.keys() and "query_embed.weight" in pretrained_dict.keys()): pretrained_dict[ "query_embed.query_embed.weight"] = pretrained_dict[ "query_embed.weight"] # 1. filter out unnecessary keys pretrained_dict = { k: v for k, v in pretrained_dict.items() if k in model_dict } # if finetuning skip the linear stuff if args.finetune: pretrained_dict = { k: v for k, v in pretrained_dict.items() if k not in ["class_embed.weight", "class_embed.bias"] } # 2. overwrite entries in the existing state dict model_dict.update(pretrained_dict) # 3. 
load new state dict model_without_ddp.load_state_dict(model_dict) if (not args.eval and not args.load_model_only and "optimizer" in checkpoint and "lr_scheduler" in checkpoint and "epoch" in checkpoint): optimizer.load_state_dict(checkpoint["optimizer"]) lr_scheduler.load_state_dict(checkpoint["lr_scheduler"]) args.start_epoch = checkpoint["epoch"] + 1 if args.eval: if args.test and args.dataset_file == "wider": if args.resume: s = args.resume.split("/")[:-1] output_dir = "/" + os.path.join(*s) else: output_dir = args.output_dir print("SAVING TEST WIDER TO ", output_dir) test_wider( model, criterion, postprocessors, dataset_val, data_loader_val, device, output_dir, ) return test_stats, coco_evaluator = evaluate( model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir, dset_file=args.dataset_file, ) if args.output_dir and coco_evaluator is not None: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch( model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm, ) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / "checkpoint.pth"] # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: checkpoint_paths.append(output_dir / f"checkpoint{epoch:04}.pth") for checkpoint_path in checkpoint_paths: utils.save_on_master( { "model": model_without_ddp.state_dict(), "optimizer": optimizer.state_dict(), "lr_scheduler": lr_scheduler.state_dict(), "epoch": epoch, "args": args, }, checkpoint_path, ) test_stats, coco_evaluator = evaluate( model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir, dset_file=args.dataset_file, ) log_stats = { **{f"train_{k}": v for k, v in train_stats.items()}, **{f"test_{k}": v for k, v in test_stats.items()}, "epoch": epoch, "n_parameters": n_parameters, } if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") # for evaluation logs if coco_evaluator is not None: (output_dir / "eval").mkdir(exist_ok=True) if "bbox" in coco_evaluator.coco_eval: filenames = ["latest.pth"] if epoch % 50 == 0: filenames.append(f"{epoch:03}.pth") for name in filenames: torch.save( coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / name, ) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print("Training time {}".format(total_time_str))
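# Note: load_state_dict(..., strict=False), used throughout the resume paths
# above, reports which keys could not be matched instead of raising; checking
# both lists is how these scripts verify a partial load. Toy illustration:
import torch

net = torch.nn.Linear(2, 2)
result = net.load_state_dict({'weight': torch.zeros(2, 2)}, strict=False)
print(result.missing_keys)     # ['bias'] -> left at its initialized values
print(result.unexpected_keys)  # []      -> checkpoint keys the model lacks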
def main(args):
    # args = parser.parse_args()
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    print(args)

    if args.seed is not None:
        # random.seed(args.seed)
        # torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    ##################################
    # Logging setting
    ##################################
    if args.output_dir and utils.is_main_process():
        logging.basicConfig(
            filename=os.path.join(args.output_dir, args.log_name),
            filemode='w',
            format='%(asctime)s: %(levelname)s: [%(filename)s:%(lineno)d]: %(message)s',
            level=logging.INFO)
    warnings.filterwarnings("ignore")

    ##################################
    # Save to logging
    ##################################
    if utils.is_main_process():
        logging.info(str(args))

    ##################################
    # Initialize dataset
    ##################################
    if not args.evaluate:
        # build_vocab_flag=True,  # Takes a long time to build a vocab
        train_dataset = GQATorchDataset(split='train_unbiased',
                                        build_vocab_flag=False,
                                        load_vocab_flag=False)
        if args.distributed:
            sampler_train = torch.utils.data.DistributedSampler(train_dataset)
        else:
            sampler_train = torch.utils.data.RandomSampler(train_dataset)
        batch_sampler_train = torch.utils.data.BatchSampler(
            sampler_train, args.batch_size, drop_last=True)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_sampler=batch_sampler_train,
            collate_fn=GQATorchDataset_collate_fn,
            num_workers=args.workers)
        # Old version
        # train_loader = torch.utils.data.DataLoader(
        #     train_dataset, batch_size=args.batch_size, shuffle=True,
        #     collate_fn=GQATorchDataset_collate_fn,
        #     num_workers=args.workers, pin_memory=True)

    val_dataset_list = []
    for eval_split in args.evaluate_sets:
        val_dataset_list.append(
            GQATorchDataset(split=eval_split,
                            build_vocab_flag=False,
                            load_vocab_flag=args.evaluate))
    val_dataset = torch.utils.data.ConcatDataset(val_dataset_list)

    if args.distributed:
        sampler_val = torch.utils.data.DistributedSampler(val_dataset, shuffle=False)
    else:
        sampler_val = torch.utils.data.SequentialSampler(val_dataset)
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=args.batch_size,
        sampler=sampler_val,
        drop_last=False,
        collate_fn=GQATorchDataset_collate_fn,
        num_workers=args.workers)
    # Old version
    # val_loader = torch.utils.data.DataLoader(
    #     val_dataset,
    #     batch_size=args.batch_size, shuffle=False,
    #     collate_fn=GQATorchDataset_collate_fn,
    #     num_workers=args.workers, pin_memory=True)

    ##################################
    # Initialize model
    # - note: must init dataset first,
    #   since we will use the vocab from the dataset
    ##################################
    model = PipelineModel()

    ##################################
    # Deploy model on GPU
    ##################################
    # `cuda` is assumed to be a module-level device,
    # e.g. cuda = torch.device('cuda')
    model = model.to(device=cuda)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    ##################################
    # define optimizer (and scheduler)
    ##################################
    # optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    optimizer = torch.optim.Adam(
        params=model.parameters(),
        lr=args.lr,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=0,  # weight_decay=args.weight_decay
        amsgrad=False,
    )
    # optimizer = torch.optim.AdamW(
    #     params=model.parameters(),
    #     lr=args.lr,
    #     weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model_without_ddp.load_state_dict(checkpoint['model'])
            if not args.evaluate:
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                if 'lr_scheduler' in checkpoint:
                    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
                if 'epoch' in checkpoint:
                    args.start_epoch = checkpoint['epoch'] + 1
            # checkpoint = torch.load(args.resume)
            # args.start_epoch = checkpoint['epoch']
            # model.load_state_dict(checkpoint['state_dict'])
            # optimizer.load_state_dict(checkpoint['optimizer'])
            # print("=> loaded checkpoint '{}' (epoch {})"
            #       .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # cudnn.benchmark = True

    ##################################
    # Define loss functions (criterion)
    ##################################
    # criterion = torch.nn.CrossEntropyLoss().cuda()
    text_pad_idx = GQATorchDataset.TEXT.vocab.stoi[GQATorchDataset.TEXT.pad_token]
    criterion = {
        "program": torch.nn.CrossEntropyLoss(ignore_index=text_pad_idx).to(device=cuda),
        "full_answer": torch.nn.CrossEntropyLoss(ignore_index=text_pad_idx).to(device=cuda),
        "short_answer": torch.nn.CrossEntropyLoss().to(device=cuda),
        # "short_answer": torch.nn.BCEWithLogitsLoss().to(device=cuda),  # sigmoid
        "execution_bitmap": torch.nn.BCELoss().to(device=cuda),
    }

    ##################################
    # If Evaluate Only
    ##################################
    if args.evaluate:
        validate(val_loader, model, criterion, args, DUMP_RESULT=True)
        return

    ##################################
    # Main Training Loop
    ##################################
    # best_acc1 = 0
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            ##################################
            # In distributed mode, calling set_epoch(epoch) at the beginning
            # of each epoch, before creating the DataLoader iterator, is
            # necessary to make shuffling work properly across multiple
            # epochs. Otherwise, the same ordering will always be used.
            ##################################
            sampler_train.set_epoch(epoch)

        lr_scheduler.step()
        # adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        if (epoch + 1) % 5 == 0:
            validate(val_loader, model, criterion, args, FAST_VALIDATE_FLAG=False)

        # # remember best acc@1 and save checkpoint
        # save_checkpoint({
        #     'epoch': epoch + 1,
        #     # 'arch': args.arch,
        #     'state_dict': model.state_dict(),
        #     # 'best_acc1': best_acc1,
        #     'optimizer': optimizer.state_dict(),
        # }, is_best)

        if args.output_dir:
            output_dir = pathlib.Path(args.output_dir)
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)
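# ---------------------------------------------------------------------------
# Minimal sketch of why sampler_train.set_epoch(epoch) matters: a
# DistributedSampler seeds its shuffle with (seed + epoch), so without
# set_epoch every epoch replays the same order. num_replicas and rank are
# passed explicitly here so the sketch runs without torch.distributed being
# initialized; the dataset and sizes are illustrative only.

import torch
from torch.utils.data import TensorDataset
from torch.utils.data.distributed import DistributedSampler

dataset = TensorDataset(torch.arange(8))
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)

sampler.set_epoch(0)
order_epoch0 = list(sampler)
sampler.set_epoch(0)
assert list(sampler) == order_epoch0   # same epoch -> same order

sampler.set_epoch(1)
order_epoch1 = list(sampler)
print(order_epoch0, order_epoch1)      # different epoch -> reshuffled
# ---------------------------------------------------------------------------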
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # no validation ground truth for ytvos dataset
    dataset_train = build_dataset(image_set='train', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)

    output_dir = Path(args.output_dir)

    # load coco pretrained weights; the classification head and query
    # embeddings do not match this model, so drop them and load the rest
    # non-strictly
    checkpoint = torch.load(args.pretrained_weights, map_location='cpu')['model']
    del checkpoint["vistr.class_embed.weight"]
    del checkpoint["vistr.class_embed.bias"]
    del checkpoint["vistr.query_embed.weight"]
    # use model_without_ddp here: `model.module` only exists when the model
    # is wrapped in DistributedDataParallel
    model_without_ddp.load_state_dict(checkpoint, strict=False)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if (not args.eval and 'optimizer' in checkpoint
                and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint):
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and after every epoch
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" not in n and p.requires_grad
            ]
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if "backbone" in n and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop,
                                                   gamma=0.9)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    output_dir = output_dir / f"{args.backbone}_{args.transformer_type}"
    if args.output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if (not args.eval and 'optimizer' in checkpoint
                and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint):
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        # eval-only: stop before the training loop
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / f'checkpoint_{epoch}.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch}_extra.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
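# ---------------------------------------------------------------------------
# Minimal sketch of the param_dicts pattern used in the functions above:
# parameters whose name contains "backbone" go into a group with its own
# (typically smaller) learning rate, everything else uses the optimizer
# default. The toy model and rates are illustrative, not the repo's DETR.

import torch

toy = torch.nn.ModuleDict({
    "backbone": torch.nn.Linear(4, 4),
    "head": torch.nn.Linear(4, 2),
})
param_dicts = [
    {"params": [p for n, p in toy.named_parameters()
                if "backbone" not in n and p.requires_grad]},
    {"params": [p for n, p in toy.named_parameters()
                if "backbone" in n and p.requires_grad],
     "lr": 1e-5},  # stands in for args.lr_backbone
]
optimizer = torch.optim.AdamW(param_dicts, lr=1e-4, weight_decay=1e-4)
print([g["lr"] for g in optimizer.param_groups])  # [0.0001, 1e-05]
# ---------------------------------------------------------------------------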
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)

    if args.det_val:
        assert args.eval, 'only support eval mode of detector for track'
        model, criterion, postprocessors = build_model(args)
    elif args.eval:
        model, criterion, postprocessors = build_tracktest_model(args)
    else:
        model, criterion, postprocessors = build_tracktrain_model(args)

    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set=args.track_train_split, args=args)
    dataset_val = build_dataset(image_set=args.track_eval_split, args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train,
                                   batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val, args.batch_size,
                                 sampler=sampler_val, drop_last=False,
                                 collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers,
                                 pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if not match_name_keywords(n, args.lr_backbone_names)
                and not match_name_keywords(n, args.lr_linear_proj_names)
                and p.requires_grad
            ],
            "lr": args.lr,
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad
            ],
            "lr": args.lr_backbone,
        },
        {
            "params": [
                p for n, p in model_without_ddp.named_parameters()
                if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad
            ],
            "lr": args.lr * args.lr_linear_proj_mult,
        },
    ]

    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [
            k for k in unexpected_keys
            if not (k.endswith('total_params') or k.endswith('total_ops'))
        ]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if (not args.eval and 'optimizer' in checkpoint
                and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint):
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for doing experiments that resume from a
            # checkpoint and also modify the lr scheduler (e.g., decrease lr
            # in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print('Warning: (hack) args.override_resumed_lr_drop is set to True, '
                      'so args.lr_drop would override lr_drop in resumed lr_scheduler.')
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(
                    map(lambda group: group['initial_lr'], optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        # if not args.eval:
        #     test_stats, coco_evaluator, _ = evaluate(
        #         model, criterion, postprocessors, data_loader_val, base_ds,
        #         device, args.output_dir
        #     )

    if args.eval:
        assert args.batch_size == 1, "Now only support batch size 1."
        tracker = Tracker(score_thresh=args.track_thresh)
        test_stats, coco_evaluator, res_tracks = evaluate(model, criterion,
                                                          postprocessors,
                                                          data_loader_val,
                                                          base_ds, device,
                                                          args.output_dir,
                                                          tracker=tracker,
                                                          phase='eval',
                                                          det_val=args.det_val,
                                                          fp16=args.fp16)
        if args.output_dir:
            # utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
            if res_tracks is not None:
                print("Creating video index for {}.".format(args.dataset_file))
                video_to_images = defaultdict(list)
                video_names = defaultdict()
                for _, info in dataset_val.coco.imgs.items():
                    video_to_images[info["video_id"]].append({
                        "image_id": info["id"],
                        "frame_id": info["frame_id"]
                    })
                    video_name = info["file_name"].split("/")[0]
                    if video_name not in video_names:
                        video_names[info["video_id"]] = video_name
                assert len(video_to_images) == len(video_names)
                # save mot results.
                save_track(res_tracks, args.output_dir, video_to_images,
                           video_names, args.track_eval_split)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, scaler, epoch,
                                      args.clip_max_norm, fp16=args.fp16)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'args': args,
                    }, checkpoint_path)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        if epoch % 10 == 0 or epoch > args.epochs - 5:
            test_stats, coco_evaluator, _ = evaluate(model, criterion,
                                                     postprocessors,
                                                     data_loader_val, base_ds,
                                                     device, args.output_dir,
                                                     fp16=args.fp16)
            log_test_stats = {**{f'test_{k}': v for k, v in test_stats.items()}}
            log_stats.update(log_test_stats)

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            # if coco_evaluator is not None:
            #     (output_dir / 'eval').mkdir(exist_ok=True)
            #     if "bbox" in coco_evaluator.coco_eval:
            #         filenames = ['latest.pth']
            #         if epoch % 50 == 0:
            #             filenames.append(f'{epoch:03}.pth')
            #         for name in filenames:
            #             torch.save(coco_evaluator.coco_eval["bbox"].eval,
            #                        output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
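# ---------------------------------------------------------------------------
# Minimal sketch of the fp16 path wired through train_one_epoch above,
# assuming the usual torch.cuda.amp recipe (the repo's actual inner loop may
# differ): forward under autocast, scale the loss before backward, unscale
# before gradient clipping, then step through the scaler. With fp16=False the
# scaler and autocast are no-ops, so the sketch also runs on CPU; the model,
# data, and hyperparameters here are illustrative only.

import torch

fp16 = torch.cuda.is_available()
device = torch.device('cuda' if fp16 else 'cpu')
model = torch.nn.Linear(8, 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
scaler = torch.cuda.amp.GradScaler(enabled=fp16)

x = torch.randn(4, 8, device=device)
target = torch.randn(4, 1, device=device)

optimizer.zero_grad()
with torch.cuda.amp.autocast(enabled=fp16):
    loss = torch.nn.functional.mse_loss(model(x), target)
scaler.scale(loss).backward()
scaler.unscale_(optimizer)                    # so clipping sees true gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.1)
scaler.step(optimizer)                        # skips the step on inf/nan grads
scaler.update()
# ---------------------------------------------------------------------------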