def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)  # use the same model as the DETR paper on COCO
    model.to(device)
    model_without_ddp = model

    # modify the dataset from coco to nvdata (roots come from the maglev args)
    dataset_train_ = build_nvdataset(dataset_root=[args.dataset_root_sql, args.dataset_root_img],
                                     mode='train')
    dataset_val = build_nvdataset(dataset_root=[args.dataset_root_test, args.dataset_root_sql],
                                  mode='test')
    indices_50k = np.load(os.path.join(args.root_indices))
    dataset_train = Subset(dataset_train_, indices_50k)
    print("Training samples: %d" % (len(dataset_train)))

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            print("Loading pretrained model on coco")
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        # Temporarily restore the 92-way COCO head so the checkpoint loads,
        # then swap in a fresh 11-way head for the nvdata classes.
        model_without_ddp.class_embed = torch.nn.Linear(256, 91 + 1)
        model_without_ddp.load_state_dict(checkpoint['model'])
        model_without_ddp.class_embed = torch.nn.Linear(256, 11)
        model.to(device)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.eval:
        evaluate(model, dataset_val, postprocessors, device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        if epoch == 0 or (epoch + 1) % 25 == 0:
            evaluate(model, dataset_val, postprocessors, device, output_dir)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
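# A minimal, self-contained sketch (not part of the training scripts) of the
# param_dicts pattern used above: parameters whose names contain "backbone"
# get their own learning rate (lr_backbone), everything else inherits the
# optimizer-level default. The DummyDETR module here is hypothetical.
import torch


class DummyDETR(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = torch.nn.Linear(4, 4)     # matched by the "backbone" filter
        self.class_embed = torch.nn.Linear(4, 2)  # falls into the default group


def demo_param_groups(lr=1e-4, lr_backbone=1e-5, weight_decay=1e-4):
    model = DummyDETR()
    param_dicts = [
        {"params": [p for n, p in model.named_parameters() if "backbone" not in n]},
        {"params": [p for n, p in model.named_parameters() if "backbone" in n],
         "lr": lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=lr, weight_decay=weight_decay)
    # The first group inherits the default lr; the second keeps its own.
    for i, group in enumerate(optimizer.param_groups):
        print(f"group {i}: lr={group['lr']}, #tensors={len(group['params'])}")


if __name__ == "__main__":
    demo_param_groups()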
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    train_json = [
        '/content/icpr2018/train.json',
        '/content/das/train.json',
        '/content/ctw1500/train.json',
    ]
    val_json = [
        '/content/icpr2018/val.json',
        '/content/das/val.json',
        '/content/ctw1500/val.json',
    ]
    dataset_train = SimpleJsonDataset(json_path=train_json, mode='train',
                                      transforms=make_coco_transforms('train'))
    dataset_val = SimpleJsonDataset(json_path=val_json, mode='val',
                                    transforms=make_coco_transforms('val'))

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    # These attributes are presumably consumed inside the model's forward pass
    # (e.g. for gradient accumulation or a warm-up schedule); they are not
    # standard nn.Module fields.
    model.accumulate = 100
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        model.step = epoch * args.batch_size
        train_stats = train_one_epoch(
            model, criterion, data_loader_train, optimizer, device, epoch,
            args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers, pin_memory=True)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers, pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        for b in name_keywords:
            if b in n:
                return True
        return False

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if not match_name_keywords(n, args.lr_backbone_names)
                       and not match_name_keywords(n, args.lr_linear_proj_names)
                       and p.requires_grad],
            "lr": args.lr,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
            "lr": args.lr_backbone,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
            "lr": args.lr * args.lr_linear_proj_mult,
        },
    ]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if True:  # Create from ml-vision instead of loading args.resume directly
            from ml.vision.models.detection.detr import detr
            m = detr(pretrained=True, backbone=args.backbone, deformable=True,
                     unload_after=True)
            missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
                m.state_dict(), strict=False)
        else:
            if args.resume.startswith('https'):
                checkpoint = torch.hub.load_state_dict_from_url(
                    args.resume, map_location='cpu', check_hash=True)
            else:
                checkpoint = torch.load(args.resume, map_location='cpu')
            missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
                checkpoint['model'], strict=False)
            if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
                import copy
                p_groups = copy.deepcopy(optimizer.param_groups)
                optimizer.load_state_dict(checkpoint['optimizer'])
                for pg, pg_old in zip(optimizer.param_groups, p_groups):
                    pg['lr'] = pg_old['lr']
                    pg['initial_lr'] = pg_old['initial_lr']
                print(optimizer.param_groups)
                lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
                # todo: this is a hack for doing an experiment that resumes from a
                # checkpoint and also modifies the lr scheduler (e.g., decrease lr in advance).
                args.override_resumed_lr_drop = True
                if args.override_resumed_lr_drop:
                    print('Warning: (hack) args.override_resumed_lr_drop is set to True, '
                          'so args.lr_drop would override lr_drop in resumed lr_scheduler.')
                    lr_scheduler.step_size = args.lr_drop
                    lr_scheduler.base_lrs = list(
                        map(lambda group: group['initial_lr'], optimizer.param_groups))
                lr_scheduler.step(lr_scheduler.last_epoch)
                args.start_epoch = checkpoint['epoch'] + 1
        unexpected_keys = [k for k in unexpected_keys
                           if not (k.endswith('total_params') or k.endswith('total_ops'))]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(
                model, criterion, postprocessors, data_loader_val, base_ds,
                device, args.output_dir)

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
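# The override_resumed_lr_drop hack above mutates a resumed StepLR in place.
# A sketch of the same trick isolated on a toy optimizer (assumptions: a
# single param group and PyTorch 1.x StepLR internals; step(epoch) is
# deprecated but still re-evaluates the closed-form lr at that epoch):
import torch


def demo_override_lr_drop(new_lr_drop=20):
    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.SGD([param], lr=0.1)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=40)
    for _ in range(25):  # pretend we trained 25 epochs under step_size=40
        optimizer.step()
        scheduler.step()
    scheduler.step_size = new_lr_drop  # move the drop earlier, in advance
    scheduler.base_lrs = [g['initial_lr'] for g in optimizer.param_groups]
    scheduler.step(scheduler.last_epoch)  # recompute lr at the same epoch
    print(optimizer.param_groups[0]['lr'])  # 0.01: the drop at epoch 20 now applies


if __name__ == "__main__":
    demo_override_lr_drop()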
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # no validation ground truth for ytvos dataset
    dataset_train = build_dataset(image_set='train', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)

    output_dir = Path(args.output_dir)

    # load coco pretrained weight, dropping the heads whose shapes depend on
    # the number of classes / queries
    checkpoint = torch.load(args.pretrained_weights, map_location='cpu')['model']
    del checkpoint["vistr.class_embed.weight"]
    del checkpoint["vistr.class_embed.bias"]
    del checkpoint["vistr.query_embed.weight"]
    # load via model_without_ddp so this also works without DistributedDataParallel
    model_without_ddp.load_state_dict(checkpoint, strict=False)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every epoch
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
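# The three `del` statements above are checkpoint surgery: class_embed and
# query_embed have shapes tied to the source label/query setup, so they are
# dropped and load_state_dict(strict=False) fills in everything else. A toy
# illustration of the pattern (the Toy module and its fields are hypothetical):
import torch


class Toy(torch.nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.body = torch.nn.Linear(8, 8)
        self.class_embed = torch.nn.Linear(8, num_classes)


def demo_head_surgery():
    src, dst = Toy(num_classes=91), Toy(num_classes=11)
    state = src.state_dict()
    del state["class_embed.weight"]  # shape mismatch against dst: drop it
    del state["class_embed.bias"]
    result = dst.load_state_dict(state, strict=False)
    # the dropped head stays randomly initialized in dst
    print(result.missing_keys)  # ['class_embed.weight', 'class_embed.bias']


if __name__ == "__main__":
    demo_head_surgery()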
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    device = torch.device(args.device)

    model = fasterrcnn_resnet_fpn(num_classes=2, pretrained=False)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    params = [p for p in model_without_ddp.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(params, lr=args.lr, weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    # dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        # sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        # sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)
    # data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
    #                              collate_fn=utils.collate_fn, drop_last=False,
    #                              num_workers=args.num_workers)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    # if args.eval:
    #     test_stats = evaluate(model, data_loader_val, device=device)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, optimizer, data_loader_train, device, epoch)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every epoch
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 1 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    print(args)

    if args.seed is not None:
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    ##################################
    # Logging setting
    ##################################
    if args.output_dir and utils.is_main_process():
        logging.basicConfig(
            filename=os.path.join(args.output_dir, args.log_name),
            filemode='w',
            format='%(asctime)s: %(levelname)s: [%(filename)s:%(lineno)d]: %(message)s',
            level=logging.INFO)
    warnings.filterwarnings("ignore")

    ##################################
    # Save to logging
    ##################################
    if utils.is_main_process():
        logging.info(str(args))

    ##################################
    # Initialize dataset
    ##################################
    if not args.evaluate:
        # build_vocab_flag=True takes a long time to build a vocab
        train_dataset = GQATorchDataset(split='train_unbiased',
                                        build_vocab_flag=False,
                                        load_vocab_flag=False)
        if args.distributed:
            sampler_train = torch.utils.data.DistributedSampler(train_dataset)
        else:
            sampler_train = torch.utils.data.RandomSampler(train_dataset)
        batch_sampler_train = torch.utils.data.BatchSampler(
            sampler_train, args.batch_size, drop_last=True)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_sampler=batch_sampler_train,
            collate_fn=GQATorchDataset_collate_fn, num_workers=args.workers)

    val_dataset_list = []
    for eval_split in args.evaluate_sets:
        val_dataset_list.append(
            GQATorchDataset(split=eval_split, build_vocab_flag=False,
                            load_vocab_flag=args.evaluate))
    val_dataset = torch.utils.data.ConcatDataset(val_dataset_list)
    if args.distributed:
        sampler_val = torch.utils.data.DistributedSampler(val_dataset, shuffle=False)
    else:
        sampler_val = torch.utils.data.SequentialSampler(val_dataset)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, sampler=sampler_val,
        drop_last=False, collate_fn=GQATorchDataset_collate_fn,
        num_workers=args.workers)

    ##################################
    # Initialize model
    # - note: must init dataset first, since we will use the vocab from the dataset
    ##################################
    model = PipelineModel()

    ##################################
    # Deploy model on GPU
    ##################################
    model = model.to(device=cuda)
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    ##################################
    # define optimizer (and scheduler)
    ##################################
    # optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    optimizer = torch.optim.Adam(
        params=model.parameters(),
        lr=args.lr,
        betas=(0.9, 0.999),
        eps=1e-08,
        weight_decay=0,  # weight_decay=args.weight_decay
        amsgrad=False,
    )
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            model_without_ddp.load_state_dict(checkpoint['model'])
            if not args.evaluate:
                if 'optimizer' in checkpoint:
                    optimizer.load_state_dict(checkpoint['optimizer'])
                if 'lr_scheduler' in checkpoint:
                    lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
                if 'epoch' in checkpoint:
                    args.start_epoch = checkpoint['epoch'] + 1
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # cudnn.benchmark = True

    ##################################
    # Define loss functions (criterion)
    ##################################
    text_pad_idx = GQATorchDataset.TEXT.vocab.stoi[GQATorchDataset.TEXT.pad_token]
    criterion = {
        "program": torch.nn.CrossEntropyLoss(ignore_index=text_pad_idx).to(device=cuda),
        "full_answer": torch.nn.CrossEntropyLoss(ignore_index=text_pad_idx).to(device=cuda),
        "short_answer": torch.nn.CrossEntropyLoss().to(device=cuda),
        # "short_answer": torch.nn.BCEWithLogitsLoss().to(device=cuda),  # sigmoid
        "execution_bitmap": torch.nn.BCELoss().to(device=cuda),
    }

    ##################################
    # If Evaluate Only
    ##################################
    if args.evaluate:
        validate(val_loader, model, criterion, args, DUMP_RESULT=True)
        return

    ##################################
    # Main Training Loop
    ##################################
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            ##################################
            # In distributed mode, calling the set_epoch(epoch) method
            # at the beginning of each epoch before creating the DataLoader
            # iterator is necessary to make shuffling work properly across
            # multiple epochs. Otherwise, the same ordering will always be used.
            ##################################
            sampler_train.set_epoch(epoch)

        lr_scheduler.step()

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        if (epoch + 1) % 5 == 0:
            validate(val_loader, model, criterion, args, FAST_VALIDATE_FLAG=False)

        if args.output_dir:
            output_dir = pathlib.Path(args.output_dir)
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)
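# Why set_epoch matters, in isolation: DistributedSampler seeds its shuffle
# with (seed + epoch), so without set_epoch every epoch replays the same
# permutation on every rank. A single-process sketch with num_replicas/rank
# faked to 1/0:
from torch.utils.data import DistributedSampler


def demo_set_epoch():
    sampler = DistributedSampler(range(8), num_replicas=1, rank=0, shuffle=True)
    sampler.set_epoch(0)
    first = list(sampler)
    sampler.set_epoch(1)
    second = list(sampler)
    print(first, second)  # two different permutations of 0..7


if __name__ == "__main__":
    demo_set_epoch()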
def main(args):
    # Distributed-training setup: init_distributed_mode() decides from environment
    # variables whether to run distributed and sets the relevant parameters;
    # see util/misc.py for details.
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    # frozen_weights freezes the model weights, similar to fine-tuning in transfer
    # learning; it requires the masks flag because it only applies to segmentation.
    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility; get_rank() is the distributed node index
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # build the model, the loss (criterion), and the postprocessors from args
    model, criterion, postprocessors = build_model(args)
    model.to(device)

    # "ddp" is short for DistributedDataParallel
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    # count and report the trainable parameters
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # Set up the optimizer, the lr schedule, and the train/val datasets.
    # The backbone parameters are separated from the rest so they can be
    # trained with a different initial learning rate.
    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                  weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    # build_dataset() wraps the COCO dataset API; see datasets/__init__.py
    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)

    # With the datasets built, attach the samplers and wrap everything in
    # DataLoaders for batched training. Note the custom collate_fn
    # (util/misc.py) used to re-assemble each batch.
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        # freeze the weights and train only the segmentation head,
        # similar to fine-tuning in transfer learning
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    # Resume from an earlier training stage: restore the model weights,
    # the optimizer and lr scheduler state, and the epoch counter.
    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    # eval mode: run evaluation only, no training
    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    # The actual epoch-by-epoch training; after each epoch the lr scheduler steps.
    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        # run one training epoch; see train_one_epoch() in detr/engine.py
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch, args.clip_max_norm)
        lr_scheduler.step()

        # checkpoint the training state and related parameters
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        # evaluate on the validation set after each training epoch
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        # write the train/val stats to the log file on the (distributed) main process
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        # record the evaluation results
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    # finally, report the total training time
    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    if args.gpu_id >= 0:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)

    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    if args.neptune:
        # Connect your script to Neptune
        import neptune
        # your NEPTUNE_API_TOKEN should be added to ~/.bashrc to run this file
        neptune.init(project_qualified_name='detectwaste/detr')
        if args.dilation:
            exp_name = f"{args.dataset_file}_{args.backbone}_DC"
        else:
            exp_name = f"{args.dataset_file}_{args.backbone}"
        neptune.create_experiment(name=exp_name)
    else:
        neptune = None

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if "backbone" in n and p.requires_grad],
            "lr": args.lr_backbone,
        },
    ]
    if args.optimizer == 'LaProp':
        optimizer = LaProp(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer == 'AdamW':
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    else:
        sys.exit(f'Chosen optimizer {args.optimizer} is not available.')
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_test = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
        sampler_test = DistributedSampler(dataset_test, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)
    data_loader_test = DataLoader(dataset_test, args.batch_size, sampler=sampler_test,
                                  drop_last=False, collate_fn=utils.collate_fn,
                                  num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        if args.dataset_file in waste_datasets_list and args.start_epoch == 0:
            # For waste detection datasets we must cut the classification head
            del checkpoint["model"]["class_embed.weight"]
            del checkpoint["model"]["class_embed.bias"]
            del checkpoint["model"]["query_embed.weight"]
            model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        elif args.dataset_file == 'coco':
            model_without_ddp.load_state_dict(checkpoint['model'])
        else:
            model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_test, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch,
                                      args.clip_max_norm, neptune)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir, neptune)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                # send validation mAP to neptune
                if "bbox" in coco_evaluator.coco_eval:
                    if args.neptune:
                        neptune.log_metric('valid/bbox mAP@0.5:0.95',
                                           coco_evaluator.coco_eval['bbox'].stats[0])
                        neptune.log_metric('valid/bbox mAP@0.5',
                                           coco_evaluator.coco_eval['bbox'].stats[1])
                        if args.masks:
                            neptune.log_metric('valid/segm mAP@0.5',
                                               coco_evaluator.coco_eval['segm'].stats[1])
                            neptune.log_metric('valid/segm mAP@0.5:0.95',
                                               coco_evaluator.coco_eval['segm'].stats[0])
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    focal_alphas = dataset_train.get_focal_alphas()
    args.focal_alphas = focal_alphas

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # three lr groups: backbone, the offset/reference-point projections
    # (trained at 0.1x lr), and everything else
    backbone_params = []
    proj_params = []
    others = []
    for n, p in model_without_ddp.named_parameters():
        if 'backbone' in n:
            backbone_params.append(p)
        elif 'offset_proj' in n or 'query_ref_point_proj' in n:
            proj_params.append(p)
        else:
            others.append(p)
    param_dicts = [
        {'params': others},
        {'params': backbone_params, 'lr': args.lr_backbone},
        {'params': proj_params, 'lr': args.lr * 0.1},
    ]
    # optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
    #                               weight_decay=args.weight_decay)
    optimizer = torch.optim.Adam(param_dicts, lr=args.lr,
                                 weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                 output_dir / "eval.pth")
        return

    scaler = GradScaler(enabled=args.amp)

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model=model, criterion=criterion,
                                      data_loader=data_loader_train,
                                      optimizer=optimizer, device=device,
                                      epoch=epoch, max_norm=args.clip_max_norm,
                                      scaler=scaler, amp=args.amp)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
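# The scaler created above feeds an AMP training step. A hedged sketch of
# what train_one_epoch presumably does with it; the names here are
# illustrative, and the real signature is whatever engine.train_one_epoch
# defines:
import torch
from torch.cuda.amp import GradScaler, autocast


def amp_train_step(model, criterion, samples, targets, optimizer, scaler,
                   amp=True, max_norm=0.1):
    optimizer.zero_grad()
    with autocast(enabled=amp):
        outputs = model(samples)
        loss = criterion(outputs, targets)
    scaler.scale(loss).backward()   # scale the loss to avoid fp16 underflow
    if max_norm > 0:
        scaler.unscale_(optimizer)  # unscale before clipping the true grads
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    scaler.step(optimizer)          # skips the step on inf/nan grads
    scaler.update()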
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    # make the local torch hub / dataset cache writable
    os.system("sudo chmod -R 777 /home/shuxuang/.cache/")

    model, criterion, postprocessors = build_model(args)  # use the same model as the DETR paper on COCO
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    # evaluation only: build just the nvdata test split on local paths
    # (the maglev variant would use args.dataset_root_test / args.dataset_root_sql)
    dataset_val = build_nvdataset(dataset_root=[
        os.path.join(os.environ["HOME"], 'datasets/test'),
        os.path.join(os.environ["HOME"], 'datasets/test')],
        mode='test', camera=args.camera)
    print("Validation samples: %d" % (len(dataset_val)))

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    print(args.resume)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            print('Loading model: %s' % args.resume)
            checkpoint = torch.load(args.resume, map_location='cpu')
            print('Load model from %d epoch' % (checkpoint['epoch'] + 1))
        model_without_ddp.load_state_dict(checkpoint['model'])

    if args.eval:
        evaluate_5classes(model, dataset_val, postprocessors, device)

    return model, dataset_val, postprocessors, device
def main(args):
    utils.init_distributed_mode(args)
    print('git:\n {}\n'.format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, 'Frozen training is meant for segmentation only'
    print(args)

    device = args.device
    device = device.replace('cuda', 'gpu')
    device = paddle.set_device(device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    paddle.seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = paddle.DataParallel(model)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {'params': [p for n, p in model_without_ddp.named_parameters()
                    if 'backbone' not in n and p.requires_grad]},
        {
            'params': [p for n, p in model_without_ddp.named_parameters()
                       if 'backbone' in n and p.requires_grad],
            'lr': args.lr_backbone,
        },
    ]
    optimizer = torch2paddle.AdamW(param_dicts, lr=args.lr,
                                   weight_decay=args.weight_decay)
    # note: the scheduler's base learning rate is hard-coded to 0.01 by the
    # converter; it presumably should be args.lr
    lr_scheduler = paddle.optimizer.lr.StepDecay(step_size=args.lr_drop,
                                                 learning_rate=0.01)
    optimizer._learning_rate = lr_scheduler

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = paddle.io.RandomSampler(dataset_train)
        sampler_val = paddle.io.SequenceSampler(dataset_val)

    # paddle.io.BatchSampler takes the sampler and batch size as keyword
    # arguments (its first positional parameter is `dataset`)
    batch_sampler_train = paddle.io.BatchSampler(sampler=sampler_train,
                                                 batch_size=args.batch_size,
                                                 drop_last=True)
    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn,
                                   num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == 'coco_panoptic':
        coco_val = datasets.coco.build('val', args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = paddle.load(args.frozen_weights)
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        # paddle.load does not fetch URLs; a remote args.resume would have to
        # be downloaded beforehand
        checkpoint = paddle.load(args.resume)
        model_without_ddp.load_state_dict(checkpoint['model'])
        if (not args.eval and 'optimizer' in checkpoint
                and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint):
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval['bbox'].eval,
                                 output_dir / 'eval.pdiparams')
        return

    print('Start training')
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train,
                                      optimizer, device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pdiparams']
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pdiparams')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
                     **{f'test_{k}': v for k, v in test_stats.items()},
                     'epoch': epoch,
                     'n_parameters': n_parameters}

        if args.output_dir and utils.is_main_process():
            with (output_dir / 'log.txt').open('a') as f:
                f.write(json.dumps(log_stats) + '\n')

            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if 'bbox' in coco_evaluator.coco_eval:
                    filenames = ['latest.pdiparams']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pdiparams')
                    for name in filenames:
                        paddle.save(coco_evaluator.coco_eval['bbox'].eval,
                                    output_dir / 'eval' / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
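# The converter above attaches the scheduler by assigning the private
# optimizer._learning_rate field. A sketch of the idiomatic Paddle wiring,
# which passes the LRScheduler as the optimizer's learning_rate (assuming a
# plain paddle.optimizer.AdamW rather than the torch2paddle shim, and a
# single parameter group):
import paddle


def build_paddle_optimizer(model, lr_drop, base_lr, weight_decay):
    scheduler = paddle.optimizer.lr.StepDecay(learning_rate=base_lr,
                                              step_size=lr_drop)
    optimizer = paddle.optimizer.AdamW(learning_rate=scheduler,
                                       parameters=model.parameters(),
                                       weight_decay=weight_decay)
    # calling scheduler.step() once per epoch then drives the optimizer's lr
    return optimizer, scheduler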