def main(args):
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

    # Data loading code
    print('Loading data')
    dataset_train = build_dataset(args.train_set, args.dataset_year, args)
    dataset_val = build_dataset(args.val_set, args.dataset_year, args)
    base_ds = get_coco_api_from_dataset(dataset_val)

    print('Creating data loaders')
    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(
        sampler_train, args.batch_size, drop_last=True,
    )
    data_loader_train = DataLoader(
        dataset_train,
        batch_sampler=batch_sampler_train,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )
    data_loader_val = DataLoader(
        dataset_val,
        args.batch_size,
        sampler=sampler_val,
        drop_last=False,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )

    print('Creating model (args.return_criterion is always set to True)')
    args.return_criterion = True
    model = yolov5s(num_classes=args.num_classes)
    model.to(device)
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu],
        )
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params,
        lr=args.lr,
        momentum=args.momentum,
        weight_decay=args.weight_decay,
    )

    if args.lr_scheduler == 'cosine':
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.t_max)
    elif args.lr_scheduler == 'multi-step':
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=args.lr_steps, gamma=args.lr_gamma,
        )
    else:
        raise ValueError(f'scheduler {args.lr_scheduler} not supported')

    output_dir = Path(args.output_dir)
    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        args.start_epoch = checkpoint['epoch'] + 1

    if args.test_only:
        evaluate(model, data_loader_val, base_ds, device)
        return

    print('Start training')
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_one_epoch(model, optimizer, data_loader_train, device, epoch, args.print_freq)
        lr_scheduler.step()
        if args.output_dir:
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'args': args,
                    'epoch': epoch,
                },
                output_dir.joinpath(f'model_{epoch}.pth'),
            )

        # evaluate after every epoch
        # evaluate(model, criterion, data_loader_val, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f'Training time {total_time_str}')
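# NOTE: illustrative sketch, not part of the original script. A minimal
# argparse setup for the entry point above, inferred from the attribute
# names it reads (args.lr, args.lr_scheduler, args.t_max, ...). All defaults
# and flag names below are assumptions for demonstration, not the
# repository's actual values.
import argparse

def get_args_parser():
    parser = argparse.ArgumentParser('YOLOv5 training sketch', add_help=False)
    parser.add_argument('--device', default='cuda')
    parser.add_argument('--batch_size', default=8, type=int)
    parser.add_argument('--num_workers', default=4, type=int)
    parser.add_argument('--lr', default=0.01, type=float)
    parser.add_argument('--momentum', default=0.9, type=float)
    parser.add_argument('--weight_decay', default=5e-4, type=float)
    parser.add_argument('--lr_scheduler', default='cosine', choices=['cosine', 'multi-step'])
    parser.add_argument('--t_max', default=300, type=int)
    parser.add_argument('--lr_steps', default=[200, 250], nargs='+', type=int)
    parser.add_argument('--lr_gamma', default=0.1, type=float)
    parser.add_argument('--epochs', default=300, type=int)
    parser.add_argument('--start_epoch', default=0, type=int)
    parser.add_argument('--print_freq', default=20, type=int)
    parser.add_argument('--output_dir', default='')
    parser.add_argument('--resume', default='')
    parser.add_argument('--test_only', action='store_true')
    parser.add_argument('--num_classes', default=80, type=int)
    return parser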
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    scaler = torch.cuda.amp.GradScaler(enabled=args.fp16)

    if args.det_val:
        assert args.eval, 'only support eval mode of detector for track'
        model, criterion, postprocessors = build_model(args)
    elif args.eval:
        model, criterion, postprocessors = build_tracktest_model(args)
    else:
        model, criterion, postprocessors = build_tracktrain_model(args)

    model.to(device)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    dataset_train = build_dataset(image_set=args.track_train_split, args=args)
    dataset_val = build_dataset(image_set=args.track_eval_split, args=args)

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers, pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if not match_name_keywords(n, args.lr_backbone_names)
                       and not match_name_keywords(n, args.lr_linear_proj_names)
                       and p.requires_grad],
            "lr": args.lr,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
            "lr": args.lr_backbone,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
            "lr": args.lr * args.lr_linear_proj_mult,
        },
    ]
    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [k for k in unexpected_keys
                           if not (k.endswith('total_params') or k.endswith('total_ops'))]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for experiments that resume from a checkpoint and also
            # modify the lr scheduler (e.g., decrease lr in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print('Warning: (hack) args.override_resumed_lr_drop is set to True, '
                      'so args.lr_drop would override lr_drop in the resumed lr_scheduler.')
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'],
                                                 optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        # if not args.eval:
        #     test_stats, coco_evaluator, _ = evaluate(
        #         model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir
        #     )

    if args.eval:
        # NOTE: was `assert args.batch_size == 1, print(...)`, which always prints
        # and passes None as the assert message; use a plain message instead.
        assert args.batch_size == 1, "Evaluation currently only supports batch size 1."
        tracker = Tracker(score_thresh=args.track_thresh)
        test_stats, coco_evaluator, res_tracks = evaluate(model, criterion, postprocessors,
                                                          data_loader_val, base_ds, device,
                                                          args.output_dir, tracker=tracker,
                                                          phase='eval', det_val=args.det_val,
                                                          fp16=args.fp16)
        if args.output_dir:
            # utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
            if res_tracks is not None:
                print("Creating video index for {}.".format(args.dataset_file))
                video_to_images = defaultdict(list)
                video_names = defaultdict()
                for _, info in dataset_val.coco.imgs.items():
                    video_to_images[info["video_id"]].append({
                        "image_id": info["id"],
                        "frame_id": info["frame_id"],
                    })
                    video_name = info["file_name"].split("/")[0]
                    # record the first name seen for each video id
                    # (was `if video_name not in video_names`, which checks the wrong keys)
                    if info["video_id"] not in video_names:
                        video_names[info["video_id"]] = video_name
                assert len(video_to_images) == len(video_names)
                # save mot results.
                save_track(res_tracks, args.output_dir, video_to_images, video_names,
                           args.track_eval_split)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer,
                                      device, scaler, epoch, args.clip_max_norm,
                                      fp16=args.fp16)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters,
        }

        if epoch % 10 == 0 or epoch > args.epochs - 5:
            test_stats, coco_evaluator, _ = evaluate(model, criterion, postprocessors,
                                                     data_loader_val, base_ds, device,
                                                     args.output_dir, fp16=args.fp16)
            log_test_stats = {**{f'test_{k}': v for k, v in test_stats.items()}}
            log_stats.update(log_test_stats)

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

        # for evaluation logs
        # if coco_evaluator is not None:
        #     (output_dir / 'eval').mkdir(exist_ok=True)
        #     if "bbox" in coco_evaluator.coco_eval:
        #         filenames = ['latest.pth']
        #         if epoch % 50 == 0:
        #             filenames.append(f'{epoch:03}.pth')
        #         for name in filenames:
        #             torch.save(coco_evaluator.coco_eval["bbox"].eval,
        #                        output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
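# NOTE: illustrative sketch, not part of the original script. It isolates the
# resume-time LR override hack used above: after loading a StepLR state dict,
# the code mutates `step_size` and `base_lrs` in place and replays the last
# epoch so a new drop schedule takes effect immediately. All values below are
# toy assumptions.
import torch

def _demo_lr_override():
    param = torch.nn.Parameter(torch.zeros(1))
    optimizer = torch.optim.SGD([param], lr=0.1)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100)

    state = lr_scheduler.state_dict()           # pretend this came from a checkpoint
    lr_scheduler.load_state_dict(state)

    lr_scheduler.step_size = 40                 # override the resumed drop epoch
    lr_scheduler.base_lrs = [g['initial_lr'] for g in optimizer.param_groups]
    lr_scheduler.step(lr_scheduler.last_epoch)  # re-apply with the new schedule
    print(optimizer.param_groups[0]['lr'])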
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    wandb.init(project="qpic-project", entity="sangbaeklee", group="experiment_qpic")
    wandb.config = {
        "learning_rate": args.lr,
        "epochs": args.epochs,
        "batch_size": args.batch_size,
    }

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    wandb.watch(model)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" in n and p.requires_grad],
         "lr": args.lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if not args.hoi:
        if args.dataset_file == "coco_panoptic":
            # We also evaluate AP during panoptic training, on original coco DS
            coco_val = datasets.coco.build("val", args)
            base_ds = get_coco_api_from_dataset(coco_val)
        else:
            base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
    elif args.pretrained:
        checkpoint = torch.load(args.pretrained, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'], strict=False)

    if args.eval:
        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val, args.subject_category_id, device)
            return
        else:
            test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                                  data_loader_val, base_ds, device,
                                                  args.output_dir)
            if args.output_dir:
                utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval,
                                     output_dir / "eval.pth")
            return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer,
                                      device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        if args.hoi:
            test_stats = evaluate_hoi(args.dataset_file, model, postprocessors,
                                      data_loader_val, args.subject_category_id, device)
            coco_evaluator = None
        else:
            test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                                  data_loader_val, base_ds, device,
                                                  args.output_dir)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters,
        }

        if args.dataset_file == 'hico':
            wandb.log({
                "loss": train_stats['loss'],
                "mAP": test_stats['mAP'],
                "mAP rare": test_stats['mAP rare'],
                "mAP non-rare": test_stats['mAP non-rare'],
                "mean max recall": test_stats['mean max recall'],
            })
        elif args.dataset_file == 'vcoco':
            wandb.log({
                "loss": train_stats['loss'],
                "mAP_all": test_stats['mAP_all'],
                "mAP_thesis": test_stats['mAP_thesis'],
                "AP_hold_obj": test_stats['AP_hold_obj'],
                "AP_stand": test_stats['AP_stand'],
                "AP_sit_instr": test_stats['AP_sit_instr'],
                "AP_ride_instr": test_stats['AP_ride_instr'],
                "AP_walk": test_stats['AP_walk'],
                "AP_look_obj": test_stats['AP_look_obj'],
                "AP_hit_instr": test_stats['AP_hit_instr'],
                "AP_hit_obj": test_stats['AP_hit_obj'],
                "AP_eat_obj": test_stats['AP_eat_obj'],
                "AP_eat_instr": test_stats['AP_eat_instr'],
                "AP_jump_instr": test_stats['AP_jump_instr'],
                "AP_lay_instr": test_stats['AP_lay_instr'],
                "AP_talk_on_phone_instr": test_stats['AP_talk_on_phone_instr'],
                "AP_carry_obj": test_stats['AP_carry_obj'],
                "AP_throw_obj": test_stats['AP_throw_obj'],
                "AP_catch_obj": test_stats['AP_catch_obj'],
                "AP_cut_instr": test_stats['AP_cut_instr'],
                "AP_cut_obj": test_stats['AP_cut_obj'],
                "AP_run": test_stats['AP_run'],
                "AP_work_on_computer_instr": test_stats['AP_work_on_computer_instr'],
                "AP_ski_instr": test_stats['AP_ski_instr'],
                "AP_surf_instr": test_stats['AP_surf_instr'],
                "AP_skateboard_instr": test_stats['AP_skateboard_instr'],
                "AP_smile": test_stats['AP_smile'],
                "AP_drink_instr": test_stats['AP_drink_instr'],
                "AP_kick_obj": test_stats['AP_kick_obj'],
                "AP_point_instr": test_stats['AP_point_instr'],
                "AP_read_obj": test_stats['AP_read_obj'],
                "AP_snowboard_instr": test_stats['AP_snowboard_instr'],
            })
        else:
            continue

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
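# NOTE: illustrative sketch, not part of the original script. It shows the
# parameter-group pattern used above in isolation: backbone weights get their
# own learning rate while everything else inherits the optimizer default.
# The tiny model is a stand-in for demonstration only.
import torch
from torch import nn

def _demo_param_groups(lr=1e-4, lr_backbone=1e-5):
    model = nn.ModuleDict({'backbone': nn.Linear(4, 4), 'head': nn.Linear(4, 2)})
    param_dicts = [
        {"params": [p for n, p in model.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {"params": [p for n, p in model.named_parameters()
                    if "backbone" in n and p.requires_grad],
         "lr": lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=lr)
    # The first group inherits the default lr; the second keeps its own.
    print([g['lr'] for g in optimizer.param_groups])  # [0.0001, 1e-05]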
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("number of params:", n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" in n and p.requires_grad],
         "lr": args.lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set="train", args=args)
    dataset_val = build_dataset(image_set="val", args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(
        dataset_train,
        batch_sampler=batch_sampler_train,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )
    data_loader_val = DataLoader(
        dataset_val,
        args.batch_size if args.batch_size < 4 else 4,
        sampler=sampler_val,
        drop_last=False,
        collate_fn=utils.collate_fn,
        num_workers=args.num_workers,
    )

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    elif args.dataset_file in ["cmdd", "cmdc", "wider"]:
        base_ds = None
    elif args.dataset_file == "MOT17":
        base_ds = dataset_val
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location="cpu")
        model_without_ddp.detr.load_state_dict(checkpoint["model"])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith("https"):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location="cpu",
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location="cpu")

        # NOTE: this is Bruno's hack to load stuff in
        model_dict = model_without_ddp.state_dict()
        pretrained_dict = checkpoint["model"]
        # hack for adding query stuff
        if ("query_embed.query_embed.weight" in model_dict.keys()
                and "query_embed.weight" in pretrained_dict.keys()):
            pretrained_dict["query_embed.query_embed.weight"] = pretrained_dict["query_embed.weight"]
        # 1. filter out unnecessary keys
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        # if finetuning, skip the linear stuff
        if args.finetune:
            pretrained_dict = {k: v for k, v in pretrained_dict.items()
                               if k not in ["class_embed.weight", "class_embed.bias"]}
        # 2. overwrite entries in the existing state dict
        model_dict.update(pretrained_dict)
        # 3. load the new state dict
        model_without_ddp.load_state_dict(model_dict)

        if (not args.eval and not args.load_model_only and "optimizer" in checkpoint
                and "lr_scheduler" in checkpoint and "epoch" in checkpoint):
            optimizer.load_state_dict(checkpoint["optimizer"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
            args.start_epoch = checkpoint["epoch"] + 1

    if args.eval:
        if args.test and args.dataset_file == "wider":
            if args.resume:
                s = args.resume.split("/")[:-1]
                output_dir = "/" + os.path.join(*s)
            else:
                output_dir = args.output_dir
            print("SAVING TEST WIDER TO ", output_dir)
            test_wider(
                model,
                criterion,
                postprocessors,
                dataset_val,
                data_loader_val,
                device,
                output_dir,
            )
            return
        test_stats, coco_evaluator = evaluate(
            model,
            criterion,
            postprocessors,
            data_loader_val,
            base_ds,
            device,
            args.output_dir,
            dset_file=args.dataset_file,
        )
        if args.output_dir and coco_evaluator is not None:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            args.clip_max_norm,
        )
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / "checkpoint.pth"]
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f"checkpoint{epoch:04}.pth")
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        "model": model_without_ddp.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "lr_scheduler": lr_scheduler.state_dict(),
                        "epoch": epoch,
                        "args": args,
                    },
                    checkpoint_path,
                )

        test_stats, coco_evaluator = evaluate(
            model,
            criterion,
            postprocessors,
            data_loader_val,
            base_ds,
            device,
            args.output_dir,
            dset_file=args.dataset_file,
        )

        log_stats = {
            **{f"train_{k}": v for k, v in train_stats.items()},
            **{f"test_{k}": v for k, v in test_stats.items()},
            "epoch": epoch,
            "n_parameters": n_parameters,
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / "eval").mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    filenames = ["latest.pth"]
                    if epoch % 50 == 0:
                        filenames.append(f"{epoch:03}.pth")
                    for name in filenames:
                        torch.save(
                            coco_evaluator.coco_eval["bbox"].eval,
                            output_dir / "eval" / name,
                        )

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print("Training time {}".format(total_time_str))
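# NOTE: illustrative sketch, not part of the original script. It isolates the
# filter-update-load pattern from the resume branch above: keep only the
# checkpoint keys that exist in the target model, optionally drop the
# classification head, merge, then load. Module names and shapes are toy
# assumptions.
import torch
from torch import nn

def _demo_partial_load(finetune=True):
    target = nn.ModuleDict({'body': nn.Linear(4, 4), 'class_embed': nn.Linear(4, 3)})
    source = nn.ModuleDict({'body': nn.Linear(4, 4), 'class_embed': nn.Linear(4, 91),
                            'extra': nn.Linear(4, 4)})

    model_dict = target.state_dict()
    pretrained_dict = source.state_dict()
    # 1. filter out keys the target model does not have ('extra.*' is dropped)
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    # if finetuning, also skip the classification head; without this, the
    # 91-class head from `source` would shape-clash with the 3-class target
    if finetune:
        pretrained_dict = {k: v for k, v in pretrained_dict.items()
                           if not k.startswith('class_embed.')}
    # 2. overwrite matching entries, then 3. load the merged state dict
    model_dict.update(pretrained_dict)
    target.load_state_dict(model_dict)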
def main(args):
    dist.init_distributed_mode(args)

    if args.dataset_config is not None:
        # https://stackoverflow.com/a/16878364
        d = vars(args)
        with open(args.dataset_config, "r") as f:
            cfg = json.load(f)
        d.update(cfg)

    print("git:\n {}\n".format(utils.get_sha()))

    if args.mask_model != "none":
        args.masks = True
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + dist.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.set_deterministic(True)

    model, criterion, _, _, _ = build_model(args)
    model.to(device)

    model_ema = deepcopy(model) if args.ema else None
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("number of params:", n_parameters)

    if len(args.combine_datasets_val) == 0:
        raise RuntimeError("Please provide at least one validation dataset")

    Val_all = namedtuple(typename="val_data",
                         field_names=["dataset_name", "dataloader", "base_ds", "evaluator_list"])

    val_tuples = []
    for dset_name in args.combine_datasets_val:
        dset = build_dataset(dset_name, image_set=args.split, args=args)
        sampler = (DistributedSampler(dset, shuffle=False) if args.distributed
                   else torch.utils.data.SequentialSampler(dset))
        dataloader = DataLoader(
            dset,
            args.batch_size,
            sampler=sampler,
            drop_last=False,
            collate_fn=partial(utils.collate_fn, False),
            num_workers=args.num_workers,
        )
        base_ds = get_coco_api_from_dataset(dset)
        val_tuples.append(Val_all(dataset_name=dset_name, dataloader=dataloader,
                                  base_ds=base_ds, evaluator_list=None))

    if args.load:
        print("loading from", args.load)
        # NOTE: this branch originally read args.resume, which contradicts the
        # print above and the separate resume branch below; use args.load here.
        if args.load.startswith("https"):
            checkpoint = torch.hub.load_state_dict_from_url(args.load, map_location="cpu",
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.load, map_location="cpu")
        if "model_ema" in checkpoint:
            model_without_ddp.load_state_dict(checkpoint["model_ema"], strict=False)
        else:
            model_without_ddp.load_state_dict(checkpoint["model"], strict=False)
        model_ema = deepcopy(model_without_ddp)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith("https"):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location="cpu",
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location="cpu")
        model_without_ddp.load_state_dict(checkpoint["model"])
        if args.ema:
            if "model_ema" not in checkpoint:
                print("WARNING: ema model not found in checkpoint, resetting to current model")
                model_ema = deepcopy(model_without_ddp)
            else:
                model_ema.load_state_dict(checkpoint["model_ema"])

    with open(Path(args.gqa_ann_path) / "gqa_answer2id.json", "r") as f:
        answer2id = json.load(f)
    with open(Path(args.gqa_ann_path) / "gqa_answer2id_by_type.json", "r") as f:
        answer2id_by_type = json.load(f)
    id2answer = {v: k for k, v in answer2id.items()}
    id2answerbytype = {}
    for ans_type, answer_dict in answer2id_by_type.items():
        id2answerbytype[ans_type] = {v: k for k, v in answer_dict.items()}

    print("Running evaluation")
    test_model = model_ema if model_ema is not None else model

    for i, item in enumerate(val_tuples):
        evaluator_list = [GQAEvaluator()]
        item = item._replace(evaluator_list=evaluator_list)
        evaluate(
            test_model,
            criterion,
            item.dataloader,
            item.evaluator_list,
            device,
            args.output_dir,
            args,
            id2answer,
            id2answerbytype,
        )
    return
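# NOTE: illustrative sketch, not part of the original script. The eval code
# above prefers `model_ema` over `model` when it exists; this shows the usual
# way such an exponential-moving-average copy is maintained during training.
# The decay value and helper name are assumptions.
import torch
from copy import deepcopy

@torch.no_grad()
def _demo_update_ema(model, model_ema, decay=0.9998):
    # Exponentially average floating-point entries (and copy the rest,
    # e.g. integer buffers) from `model` into the EMA copy, in place.
    msd = model.state_dict()
    for k, v in model_ema.state_dict().items():
        if v.dtype.is_floating_point:
            v.copy_(v * decay + msd[k].detach() * (1.0 - decay))
        else:
            v.copy_(msd[k])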
def main(args):
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    # Save our Wandb metadata
    if not args.no_wb:
        wandb.init(entity='dl-project', project='dl-final-project',
                   name=args.wb_name, notes=args.wb_notes, reinit=True)
        wandb.config.epochs = args.epochs

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)
    # visualize_video(model, postprocessors)

    model_without_ddp = model
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of trainable params:', n_parameters)
    wandb.config.n_parameters = n_parameters
    wandb.config.n_trainable_parameters = n_parameters  # better name

    # Log total # of model parameters (including frozen) to W&B
    n_total_parameters = sum(p.numel() for p in model.parameters())
    print('total number of parameters:', n_total_parameters)
    wandb.config.n_total_parameters = n_total_parameters

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    # For visualization we want the raw images without any normalization or random resizing
    dataset_val_without_resize = CocoDetection(
        "data/coco/val2017",
        annFile="data/coco/annotations/instances_val2017.json",
        transforms=T.Compose([T.ToTensor()]))

    # Save metadata about training + val datasets and batch size
    wandb.config.len_dataset_train = len(dataset_train)
    wandb.config.len_dataset_val = len(dataset_val)
    wandb.config.batch_size = args.batch_size

    if args.distributed:
        if args.cache_mode:
            sampler_train = samplers.NodeDistributedSampler(dataset_train)
            sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False)
        else:
            sampler_train = samplers.DistributedSampler(dataset_train)
            sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers,
                                   pin_memory=True)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers, pin_memory=True)

    # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"]
    def match_name_keywords(n, name_keywords):
        out = False
        for b in name_keywords:
            if b in n:
                out = True
                break
        return out

    for n, p in model_without_ddp.named_parameters():
        print(n)

    param_dicts = [
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if not match_name_keywords(n, args.lr_backbone_names)
                       and not match_name_keywords(n, args.lr_linear_proj_names)
                       and p.requires_grad],
            "lr": args.lr,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad],
            "lr": args.lr_backbone,
        },
        {
            "params": [p for n, p in model_without_ddp.named_parameters()
                       if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad],
            "lr": args.lr * args.lr_linear_proj_mult,
        },
    ]

    # Not sure if we should save all hyperparameters in wandb.config?
    # Just start with a few important ones.
    wandb.config.lr = args.lr
    wandb.config.lr_backbone = args.lr_backbone

    if args.sgd:
        optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9,
                                    weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                      weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        missing_keys, unexpected_keys = model_without_ddp.load_state_dict(
            checkpoint['model'], strict=False)
        unexpected_keys = [k for k in unexpected_keys
                           if not (k.endswith('total_params') or k.endswith('total_ops'))]
        if len(missing_keys) > 0:
            print('Missing Keys: {}'.format(missing_keys))
        if len(unexpected_keys) > 0:
            print('Unexpected Keys: {}'.format(unexpected_keys))
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            import copy
            p_groups = copy.deepcopy(optimizer.param_groups)
            optimizer.load_state_dict(checkpoint['optimizer'])
            for pg, pg_old in zip(optimizer.param_groups, p_groups):
                pg['lr'] = pg_old['lr']
                pg['initial_lr'] = pg_old['initial_lr']
            # print(optimizer.param_groups)
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            # todo: this is a hack for experiments that resume from a checkpoint and also
            # modify the lr scheduler (e.g., decrease lr in advance).
            args.override_resumed_lr_drop = True
            if args.override_resumed_lr_drop:
                print('Warning: (hack) args.override_resumed_lr_drop is set to True, '
                      'so args.lr_drop would override lr_drop in the resumed lr_scheduler.')
                lr_scheduler.step_size = args.lr_drop
                lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'],
                                                 optimizer.param_groups))
            lr_scheduler.step(lr_scheduler.last_epoch)
            args.start_epoch = checkpoint['epoch'] + 1
        # check the resumed model
        if not args.eval:
            test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                                  data_loader_val, base_ds, device,
                                                  args.output_dir)

    if args.eval:
        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer,
                                      device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_file_for_wb = str(output_dir / f'{wandb.run.id}_checkpoint{epoch:04}.pth')
            checkpoint_paths = [output_dir / 'checkpoint.pth', checkpoint_file_for_wb]
            # extra checkpoint before LR drop and every 5 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)
            # Save model checkpoint to W&B
            wandb.save(checkpoint_file_for_wb)

        # Generate visualizations for a fixed(?) set of images every epoch
        print("Generating visualizations...")
        visualize_bbox(model, postprocessors, data_loader_val, device,
                       dataset_val_without_resize)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters,
        }

        # Save the COCO metrics properly
        metric_name = ["AP", "AP50", "AP75", "APs", "APm", "APl",
                       "AR@1", "AR@10", "AR@100", "ARs", "ARm", "ARl"]
        for i, metric_val in enumerate(log_stats["test_coco_eval_bbox"]):
            log_stats[metric_name[i]] = metric_val

        if not args.no_wb:
            wandb.log(log_stats)
        print("train_loss: ", log_stats['train_loss'])

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")
            wandb.save(str(output_dir / "log.txt"))

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                if "bbox" in coco_evaluator.coco_eval:
                    eval_filename_for_wb = f'{wandb.run.id}_eval_{epoch:04}.pth'
                    eval_path_for_wb = str(output_dir / "eval" / eval_filename_for_wb)
                    filenames = ['latest.pth', eval_filename_for_wb]
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)
                    # TODO: not sure if this file will end up being too big.
                    # I think it's the COCO precision/recall metrics in some
                    # format... let's track it just in case to start!
                    wandb.save(eval_path_for_wb)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
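# NOTE: illustrative sketch, not part of the original script. The logging code
# above unpacks the 12-element COCO stats vector positionally; this helper
# shows the same name-to-index mapping on its own, with a dummy stats list.
COCO_METRIC_NAMES = ["AP", "AP50", "AP75", "APs", "APm", "APl",
                     "AR@1", "AR@10", "AR@100", "ARs", "ARm", "ARl"]

def _demo_name_coco_stats(stats):
    # `stats` is the list stored under log_stats["test_coco_eval_bbox"].
    assert len(stats) == len(COCO_METRIC_NAMES)
    return dict(zip(COCO_METRIC_NAMES, stats))

# _demo_name_coco_stats([0.42] * 12)  ->  {"AP": 0.42, "AP50": 0.42, ...}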
def main(args):
    if args.gpu_id >= 0:
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # see issue #152
        os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)

    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    if args.neptune:
        # Connect your script to Neptune
        import neptune
        # your NEPTUNE_API_TOKEN should be added to ~/.bashrc to run this file
        neptune.init(project_qualified_name='detectwaste/detr')
        if args.dilation:
            exp_name = f"{args.dataset_file}_{args.backbone}_DC"
        else:
            exp_name = f"{args.dataset_file}_{args.backbone}"
        neptune.create_experiment(name=exp_name)
    else:
        neptune = None

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" in n and p.requires_grad],
         "lr": args.lr_backbone},
    ]
    if args.optimizer == 'LaProp':
        optimizer = LaProp(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
    elif args.optimizer == 'AdamW':
        optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
    else:
        sys.exit(f'Chosen optimizer {args.optimizer} is not available.')
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)
    dataset_test = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
        sampler_test = DistributedSampler(dataset_test, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_test = torch.utils.data.SequentialSampler(dataset_test)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)
    data_loader_test = DataLoader(dataset_test, args.batch_size, sampler=sampler_test,
                                  drop_last=False, collate_fn=utils.collate_fn,
                                  num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        checkpoint = torch.load(args.frozen_weights, map_location='cpu')
        model_without_ddp.detr.load_state_dict(checkpoint['model'])

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu',
                                                            check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        if args.dataset_file in waste_datasets_list and args.start_epoch == 0:
            # For waste detection datasets we must cut off the classification head
            del checkpoint["model"]["class_embed.weight"]
            del checkpoint["model"]["class_embed.bias"]
            del checkpoint["model"]["query_embed.weight"]
            model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        elif args.dataset_file == 'coco':
            model_without_ddp.load_state_dict(checkpoint['model'])
        else:
            model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
        if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1

    if args.eval:
        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_test, base_ds, device,
                                              args.output_dir)
        if args.output_dir:
            utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer,
                                      device, epoch, args.clip_max_norm, neptune)
        lr_scheduler.step()
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth')
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master({
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    'args': args,
                }, checkpoint_path)

        test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
                                              data_loader_val, base_ds, device,
                                              args.output_dir, neptune)

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters,
        }

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

            # for evaluation logs
            if coco_evaluator is not None:
                (output_dir / 'eval').mkdir(exist_ok=True)
                # send validation mAP to neptune
                if "bbox" in coco_evaluator.coco_eval:
                    if args.neptune:
                        neptune.log_metric('valid/bbox AP@0.5:0.95',
                                           coco_evaluator.coco_eval['bbox'].stats[0])
                        neptune.log_metric('valid/bbox AP@0.5',
                                           coco_evaluator.coco_eval['bbox'].stats[1])
                        if args.masks:
                            neptune.log_metric('valid/segm AP@0.5',
                                               coco_evaluator.coco_eval['segm'].stats[1])
                            neptune.log_metric('valid/segm AP@0.5:0.95',
                                               coco_evaluator.coco_eval['segm'].stats[0])
                    filenames = ['latest.pth']
                    if epoch % 50 == 0:
                        filenames.append(f'{epoch:03}.pth')
                    for name in filenames:
                        torch.save(coco_evaluator.coco_eval["bbox"].eval,
                                   output_dir / "eval" / name)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
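# NOTE: illustrative sketch, not part of the original script. It shows the
# head-cutting trick from the resume branch above in isolation: deleting the
# class/query weights from a checkpoint lets a model with a different number
# of classes load the remaining weights with strict=False. The toy checkpoint
# below is an assumption for demonstration.
import torch
from torch import nn

def _demo_cut_head():
    pretrained = nn.ModuleDict({'body': nn.Linear(4, 4), 'class_embed': nn.Linear(4, 91)})
    checkpoint = {'model': pretrained.state_dict()}
    # drop the classification head so its shape cannot clash
    del checkpoint['model']['class_embed.weight']
    del checkpoint['model']['class_embed.bias']

    new_model = nn.ModuleDict({'body': nn.Linear(4, 4), 'class_embed': nn.Linear(4, 8)})
    missing, unexpected = new_model.load_state_dict(checkpoint['model'], strict=False)
    print(missing)  # ['class_embed.weight', 'class_embed.bias']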
def main(args):
    utils.init_distributed_mode(args)

    if args.frozen_weights is not None:
        assert args.masks, "Frozen training is meant for segmentation only"
    print(args)

    device = torch.device(args.device)

    # fix the seed for reproducibility
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    model, criterion, postprocessors = build_model(args)
    model.to(device)

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('number of params:', n_parameters)

    param_dicts = [
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" not in n and p.requires_grad]},
        {"params": [p for n, p in model_without_ddp.named_parameters()
                    if "backbone" in n and p.requires_grad],
         "lr": args.lr_backbone},
    ]
    optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)

    dataset_train = build_dataset(image_set='train', args=args)
    dataset_val = build_dataset(image_set='val', args=args)

    if args.distributed:
        sampler_train = DistributedSampler(dataset_train)
        sampler_val = DistributedSampler(dataset_val, shuffle=False)
    else:
        sampler_train = torch.utils.data.RandomSampler(dataset_train)
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True)

    data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
                                   collate_fn=utils.collate_fn, num_workers=args.num_workers)
    data_loader_val = DataLoader(dataset_val, batch_size=1, sampler=sampler_val,
                                 drop_last=False, collate_fn=utils.collate_fn,
                                 num_workers=args.num_workers)

    if args.dataset_file == "coco_panoptic":
        # We also evaluate AP during panoptic training, on original coco DS
        coco_val = datasets.coco.build("val", args)
        base_ds = get_coco_api_from_dataset(coco_val)
    else:
        base_ds = get_coco_api_from_dataset(dataset_val)

    if args.frozen_weights is not None:
        io.load_frozen(args, model_without_ddp)

    output_dir = Path(args.output_dir)
    if args.resume:
        io.resume(args, model_without_ddp, optimizer, lr_scheduler)
    elif args.finetune:
        io.finetune(args, model_without_ddp)

    if args.eval:
        if args.output_dir and utils.is_main_process():
            io.init_wandb(args.dataset_file + "-detr-eval", model, args,
                          n_parameters=n_parameters)
        test_stats, evaluator = evaluate(model, criterion, postprocessors,
                                         data_loader_val, base_ds, device,
                                         args.output_dir)
        if args.output_dir:
            io.save_on_master(evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth")
        return

    print("Start training")
    start_time = time.time()
    if args.output_dir and utils.is_main_process():
        io.init_wandb(args.dataset_file + "-detr", model, args, n_parameters=n_parameters)
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            sampler_train.set_epoch(epoch)
        train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer,
                                      device, epoch, args.clip_max_norm)
        lr_scheduler.step()
        if args.output_dir:
            # extra checkpoint before LR drop and every 100 epochs
            if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0:
                io.save_checkpoint(args, model_without_ddp, optimizer, lr_scheduler, epoch)

        test_stats, evaluator = evaluate(model, criterion, postprocessors,
                                         data_loader_val, base_ds, device,
                                         args.output_dir, epoch)

        if utils.is_main_process() and args.output_dir:
            io.log_wandb(train_stats, test_stats)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))

    # save final model
    if utils.is_main_process() and args.output_dir:
        io.save_on_master(model_without_ddp, output_dir / "model_final.pth")

    print('Training time {}'.format(total_time_str))
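# NOTE: illustrative sketch, not part of the original script. The io.* helpers
# above are repo-specific; this is the common shape of a "save only on rank 0"
# guard in torch.distributed, under the assumption that the main process is
# the one with global rank 0.
import torch
import torch.distributed as torch_dist

def is_main_process():
    if not (torch_dist.is_available() and torch_dist.is_initialized()):
        return True  # single-process runs are always "main"
    return torch_dist.get_rank() == 0

def save_on_master(*args, **kwargs):
    # Avoids every rank writing the same checkpoint file concurrently.
    if is_main_process():
        torch.save(*args, **kwargs)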
def main(args): # Init distributed mode dist.init_distributed_mode(args) # Update dataset specific configs if args.dataset_config is not None: # https://stackoverflow.com/a/16878364 d = vars(args) with open(args.dataset_config, "r") as f: cfg = json.load(f) d.update(cfg) print("git:\n {}\n".format(utils.get_sha())) # Segmentation related if args.mask_model != "none": args.masks = True if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) output_dir = Path(args.output_dir) # fix the seed for reproducibility seed = args.seed + dist.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) torch.set_deterministic(True) # Build the model model, criterion, contrastive_criterion, qa_criterion, weight_dict = build_model( args) model.to(device) assert ( criterion is not None or qa_criterion is not None ), "Error: should train either detection or question answering (or both)" # Get a copy of the model for exponential moving averaged version of the model model_ema = deepcopy(model) if args.ema else None model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu], find_unused_parameters=True) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print("number of params:", n_parameters) # Set up optimizers param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and "text_encoder" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "text_encoder" in n and p.requires_grad ], "lr": args.text_encoder_lr, }, ] if args.optimizer == "sgd": optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) elif args.optimizer in ["adam", "adamw"]: optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) else: raise RuntimeError(f"Unsupported optimizer {args.optimizer}") # Train dataset if len(args.combine_datasets) == 0 and not args.eval: raise RuntimeError("Please provide at least one training dataset") dataset_train, sampler_train, data_loader_train = None, None, None if not args.eval: dataset_train = ConcatDataset([ build_dataset(name, image_set="train", args=args) for name in args.combine_datasets ]) # To handle very big datasets, we chunk it into smaller parts. 
if args.epoch_chunks > 0: print( "Splitting the training set into {args.epoch_chunks} of size approximately " f" {len(dataset_train) // args.epoch_chunks}") chunks = torch.chunk(torch.arange(len(dataset_train)), args.epoch_chunks) datasets = [ torch.utils.data.Subset(dataset_train, chunk.tolist()) for chunk in chunks ] if args.distributed: samplers_train = [DistributedSampler(ds) for ds in datasets] else: samplers_train = [ torch.utils.data.RandomSampler(ds) for ds in datasets ] batch_samplers_train = [ torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) for sampler_train in samplers_train ] assert len(batch_samplers_train) == len(datasets) data_loaders_train = [ DataLoader( ds, batch_sampler=batch_sampler_train, collate_fn=partial(utils.collate_fn, False), num_workers=args.num_workers, ) for ds, batch_sampler_train in zip(datasets, batch_samplers_train) ] else: if args.distributed: sampler_train = DistributedSampler(dataset_train) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) batch_sampler_train = torch.utils.data.BatchSampler( sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader( dataset_train, batch_sampler=batch_sampler_train, collate_fn=partial(utils.collate_fn, False), num_workers=args.num_workers, ) # Val dataset if len(args.combine_datasets_val) == 0: raise RuntimeError("Please provide at leas one validation dataset") Val_all = namedtuple(typename="val_data", field_names=[ "dataset_name", "dataloader", "base_ds", "evaluator_list" ]) val_tuples = [] for dset_name in args.combine_datasets_val: dset = build_dataset(dset_name, image_set="val", args=args) sampler = (DistributedSampler(dset, shuffle=False) if args.distributed else torch.utils.data.SequentialSampler(dset)) dataloader = DataLoader( dset, args.batch_size, sampler=sampler, drop_last=False, collate_fn=partial(utils.collate_fn, False), num_workers=args.num_workers, ) base_ds = get_coco_api_from_dataset(dset) val_tuples.append( Val_all(dataset_name=dset_name, dataloader=dataloader, base_ds=base_ds, evaluator_list=None)) if args.frozen_weights is not None: if args.resume.startswith("https"): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location="cpu", check_hash=True) else: checkpoint = torch.load(args.resume, map_location="cpu") if "model_ema" in checkpoint and checkpoint["model_ema"] is not None: model_without_ddp.detr.load_state_dict(checkpoint["model_ema"], strict=False) else: model_without_ddp.detr.load_state_dict(checkpoint["model"], strict=False) if args.ema: model_ema = deepcopy(model_without_ddp) # Used for loading weights from another model and starting a training from scratch. Especially useful if # loading into a model with different functionality. if args.load: print("loading from", args.load) checkpoint = torch.load(args.load, map_location="cpu") if "model_ema" in checkpoint: model_without_ddp.load_state_dict(checkpoint["model_ema"], strict=False) else: model_without_ddp.load_state_dict(checkpoint["model"], strict=False) if args.ema: model_ema = deepcopy(model_without_ddp) # Used for resuming training from the checkpoint of a model. Used when training times-out or is pre-empted. 
if args.resume: if args.resume.startswith("https"): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location="cpu", check_hash=True) else: checkpoint = torch.load(args.resume, map_location="cpu") model_without_ddp.load_state_dict(checkpoint["model"]) if not args.eval and "optimizer" in checkpoint and "epoch" in checkpoint: optimizer.load_state_dict(checkpoint["optimizer"]) args.start_epoch = checkpoint["epoch"] + 1 if args.ema: if "model_ema" not in checkpoint: print( "WARNING: ema model not found in checkpoint, resetting to current model" ) model_ema = deepcopy(model_without_ddp) else: model_ema.load_state_dict(checkpoint["model_ema"]) def build_evaluator_list(base_ds, dataset_name): """Helper function to build the list of evaluators for a given dataset""" evaluator_list = [] if args.no_detection: return evaluator_list iou_types = ["bbox"] if args.masks: iou_types.append("segm") evaluator_list.append( CocoEvaluator(base_ds, tuple(iou_types), useCats=False)) if "refexp" in dataset_name: evaluator_list.append(RefExpEvaluator(base_ds, ("bbox"))) if "clevrref" in dataset_name: evaluator_list.append(ClevrRefEvaluator(base_ds, ("bbox"))) if "flickr" in dataset_name: evaluator_list.append( FlickrEvaluator( args.flickr_dataset_path, subset="test" if args.test else "val", merge_boxes=args.GT_type == "merged", )) if "phrasecut" in dataset_name: evaluator_list.append( PhrasecutEvaluator( "test" if args.test else "miniv", ann_folder=args.phrasecut_orig_ann_path, output_dir=os.path.join(output_dir, "phrasecut_eval"), eval_mask=args.masks, )) return evaluator_list # Runs only evaluation, by default on the validation set unless --test is passed. if args.eval: test_stats = {} test_model = model_ema if model_ema is not None else model for i, item in enumerate(val_tuples): evaluator_list = build_evaluator_list(item.base_ds, item.dataset_name) postprocessors = build_postprocessors(args, item.dataset_name) item = item._replace(evaluator_list=evaluator_list) print(f"Evaluating {item.dataset_name}") curr_test_stats = evaluate( model=test_model, criterion=criterion, contrastive_criterion=contrastive_criterion, qa_criterion=qa_criterion, postprocessors=postprocessors, weight_dict=weight_dict, data_loader=item.dataloader, evaluator_list=item.evaluator_list, device=device, args=args, ) test_stats.update({ item.dataset_name + "_" + k: v for k, v in curr_test_stats.items() }) log_stats = { **{f"test_{k}": v for k, v in test_stats.items()}, "n_parameters": n_parameters, } print(log_stats) return # Runs training and evaluates after every --eval_skip epochs print("Start training") start_time = time.time() best_metric = 0.0 for epoch in range(args.start_epoch, args.epochs): if args.epoch_chunks > 0: sampler_train = samplers_train[epoch % len(samplers_train)] data_loader_train = data_loaders_train[epoch % len(data_loaders_train)] print( f"Starting epoch {epoch // len(data_loaders_train)}, sub_epoch {epoch % len(data_loaders_train)}" ) else: print(f"Starting epoch {epoch}") if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch( model=model, criterion=criterion, contrastive_criterion=contrastive_criterion, qa_criterion=qa_criterion, data_loader=data_loader_train, weight_dict=weight_dict, optimizer=optimizer, device=device, epoch=epoch, args=args, max_norm=args.clip_max_norm, model_ema=model_ema, ) if args.output_dir: checkpoint_paths = [output_dir / "checkpoint.pth"] # extra checkpoint before LR drop and every 2 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 2 == 0: 
checkpoint_paths.append(output_dir / f"checkpoint{epoch:04}.pth") for checkpoint_path in checkpoint_paths: dist.save_on_master( { "model": model_without_ddp.state_dict(), "model_ema": model_ema.state_dict() if args.ema else None, "optimizer": optimizer.state_dict(), "epoch": epoch, "args": args, }, checkpoint_path, ) if epoch % args.eval_skip == 0: test_stats = {} test_model = model_ema if model_ema is not None else model for i, item in enumerate(val_tuples): evaluator_list = build_evaluator_list(item.base_ds, item.dataset_name) item = item._replace(evaluator_list=evaluator_list) postprocessors = build_postprocessors(args, item.dataset_name) print(f"Evaluating {item.dataset_name}") curr_test_stats = evaluate( model=test_model, criterion=criterion, contrastive_criterion=contrastive_criterion, qa_criterion=qa_criterion, postprocessors=postprocessors, weight_dict=weight_dict, data_loader=item.dataloader, evaluator_list=item.evaluator_list, device=device, args=args, ) test_stats.update({ item.dataset_name + "_" + k: v for k, v in curr_test_stats.items() }) else: test_stats = {} log_stats = { **{f"train_{k}": v for k, v in train_stats.items()}, **{f"test_{k}": v for k, v in test_stats.items()}, "epoch": epoch, "n_parameters": n_parameters, } if args.output_dir and dist.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") if epoch % args.eval_skip == 0: if args.do_qa: metric = test_stats["gqa_accuracy_answer_total_unscaled"] else: metric = np.mean([ v[1] for k, v in test_stats.items() if "coco_eval_bbox" in k ]) if args.output_dir and metric > best_metric: best_metric = metric checkpoint_paths = [output_dir / "BEST_checkpoint.pth"] # extra checkpoint before LR drop and every 100 epochs for checkpoint_path in checkpoint_paths: dist.save_on_master( { "model": model_without_ddp.state_dict(), "optimizer": optimizer.state_dict(), "epoch": epoch, "args": args, }, checkpoint_path, ) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print("Training time {}".format(total_time_str))
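The resume and checkpoint logic above carries a model_ema alongside the raw weights but builds it elsewhere. A minimal sketch of the usual EMA bookkeeping it implies (class name and decay value are assumptions, not the repo's code):

from copy import deepcopy

import torch


class ModelEma:
    # Keep an exponentially decayed copy of the weights; evaluation and the
    # "model_ema" checkpoint entry then use the smoothed copy.
    def __init__(self, model, decay=0.9998):
        self.module = deepcopy(model).eval()  # the EMA copy never trains
        self.decay = decay
        for p in self.module.parameters():
            p.requires_grad_(False)

    @torch.no_grad()
    def update(self, model):
        msd = model.state_dict()
        for k, v in self.module.state_dict().items():
            if v.dtype.is_floating_point:
                # ema <- decay * ema + (1 - decay) * current
                v.mul_(self.decay).add_(msd[k].detach(), alpha=1.0 - self.decay)

    def state_dict(self):
        return self.module.state_dict()

    def load_state_dict(self, state_dict):
        self.module.load_state_dict(state_dict)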
def main(args): utils.init_distributed_mode(args) print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) image_set = 'fewshot' if args.fewshot_finetune else 'train' dataset_train = build_dataset(image_set=image_set, args=args) dataset_val = build_dataset(image_set='val', args=args) dataset_support = build_support_dataset(image_set=image_set, args=args) if args.distributed: if args.cache_mode: sampler_train = samplers.NodeDistributedSampler(dataset_train) sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False) sampler_support = samplers.NodeDistributedSampler(dataset_support) else: sampler_train = samplers.DistributedSampler(dataset_train) sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False) sampler_support = samplers.DistributedSampler(dataset_support) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) sampler_support = torch.utils.data.RandomSampler(dataset_support) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=False) loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers, pin_memory=True) loader_val = DataLoader(dataset_val, batch_size=args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers, pin_memory=True) loader_support = DataLoader(dataset_support, batch_size=1, sampler=sampler_support, drop_last=False, num_workers=args.num_workers, pin_memory=False) def match_name_keywords(n, name_keywords): out = False for b in name_keywords: if b in n: out = True break return out for n, p in model_without_ddp.named_parameters(): print(n) if not args.fewshot_finetune: param_dicts = [{ "params": [ p for n, p in model_without_ddp.named_parameters() if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad ], "lr": args.lr, "initial_lr": args.lr, }, { "params": [ p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad ], "lr": args.lr_backbone, "initial_lr": args.lr_backbone, }, { "params": [ p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad ], "lr": args.lr * args.lr_linear_proj_mult, "initial_lr": args.lr * args.lr_linear_proj_mult, }] else: # For few-shot finetune stage, do not train sampling offsets, reference points, and embedding related parameters param_dicts = [ { "params": [p for n, p in model_without_ddp.named_parameters() if not match_name_keywords(n, args.lr_backbone_names) and \ not match_name_keywords(n, args.lr_linear_proj_names) and \ not match_name_keywords(n, args.embedding_related_names) and p.requires_grad], "lr": args.lr, "initial_lr": args.lr, }, { "params": [p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad], "lr": args.lr_backbone, "initial_lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, weight_decay=args.weight_decay) lr_scheduler = 
WarmupMultiStepLR(optimizer, args.lr_drop_milestones, gamma=0.1, warmup_epochs=args.warmup_epochs, warmup_factor=args.warmup_factor, warmup_method='linear', last_epoch=args.start_epoch - 1) if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu], find_unused_parameters=True) model_without_ddp = model.module if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.dataset.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) else: base_ds = get_coco_api_from_dataset(dataset_val) output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') missing_keys, unexpected_keys = model_without_ddp.load_state_dict( checkpoint['model'], strict=False) unexpected_keys = [ k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops')) ] if len(missing_keys) > 0: print('Missing Keys: {}'.format(missing_keys)) if len(unexpected_keys) > 0: print('Unexpected Keys: {}'.format(unexpected_keys)) if args.fewshot_finetune: if args.category_codes_cls_loss: # Re-init weights of novel categories for few-shot finetune novel_class_ids = datasets.get_class_ids(args.dataset_file, type='novel') if args.num_feature_levels == 1: for novel_class_id in novel_class_ids: nn.init.normal_(model_without_ddp.category_codes_cls.L. weight[novel_class_id]) elif args.num_feature_levels > 1: for classifier in model_without_ddp.category_codes_cls: for novel_class_id in novel_class_ids: nn.init.normal_( classifier.L.weight[novel_class_id]) else: raise RuntimeError if args.eval: # Evaluate only base categories test_stats, coco_evaluator = evaluate(args, model, criterion, postprocessors, loader_val, loader_support, base_ds, device, type='base') if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval_base.pth") # Evaluate only novel categories test_stats, coco_evaluator = evaluate(args, model, criterion, postprocessors, loader_val, loader_support, base_ds, device, type='novel') if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval_novel.pth") return print("Start training...") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(args, model, criterion, loader_train, optimizer, device, epoch, args.clip_max_norm) lr_scheduler.step() # Saving Checkpoints after each epoch if args.output_dir and (not args.fewshot_finetune): checkpoint_paths = [output_dir / 'checkpoint.pth'] for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) # Saving Checkpoints every args.save_every_epoch epoch(s) if args.output_dir: checkpoint_paths = [] if (epoch + 1) % args.save_every_epoch == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) # Evaluation and Logging if (epoch + 1) % args.eval_every_epoch == 0: if 'base' 
in args.dataset_file: evaltype = 'base' else: evaltype = 'all' if args.fewshot_finetune: evaltype = 'novel' test_stats, coco_evaluator = evaluate(args, model, criterion, postprocessors, loader_val, loader_support, base_ds, device, type=evaltype) log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters, 'evaltype': evaltype } if args.output_dir and utils.is_main_process(): with (output_dir / "results.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") # for evaluation logs if coco_evaluator is not None: (output_dir / 'eval').mkdir(exist_ok=True) if "bbox" in coco_evaluator.coco_eval: filenames = ['latest.pth'] filenames.append(f'{epoch:03}.pth') for name in filenames: torch.save(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / name) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
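The few-shot script above instantiates WarmupMultiStepLR, which is defined elsewhere in that repo. Assuming it follows the common Detectron-style recipe (linear warmup for warmup_epochs, then stepped gamma decay at the milestones), a sketch:

from bisect import bisect_right

import torch


class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
    # Note: resuming with last_epoch != -1 (as the script does) requires
    # "initial_lr" in each param group, which that script supplies.
    def __init__(self, optimizer, milestones, gamma=0.1, warmup_epochs=1,
                 warmup_factor=1e-3, warmup_method='linear', last_epoch=-1):
        self.milestones = sorted(milestones)
        self.gamma = gamma
        self.warmup_epochs = warmup_epochs
        self.warmup_factor = warmup_factor
        self.warmup_method = warmup_method
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        factor = 1.0
        if self.last_epoch < self.warmup_epochs:
            if self.warmup_method == 'linear':
                # ramp linearly from warmup_factor up to 1.0
                alpha = self.last_epoch / max(1, self.warmup_epochs)
                factor = self.warmup_factor * (1 - alpha) + alpha
            else:  # 'constant'
                factor = self.warmup_factor
        # one gamma decay per milestone already passed
        decay = self.gamma ** bisect_right(self.milestones, self.last_epoch)
        return [base_lr * factor * decay for base_lr in self.base_lrs]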
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) dataset_train = build_dataset(split='train', args=args) dataset_val = build_dataset(split='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) elif args.dataset_file == "coco": base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: checkpoint = torch.load(args.frozen_weights, map_location='cpu') model_without_ddp.detr.load_state_dict(checkpoint['model']) output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 # if args.eval: # if 'coco' in args.dataset_file: # test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, # data_loader_val, base_ds, device, args.output_dir) # if args.output_dir: # utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") # elif 'anet' == args.dataset_file: # evaluate3d(model, postprocessors, data_loader_val, device, epoch=0) # return print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, 
device, epoch, args.clip_max_norm) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) if epoch % args.eval_freq == 0: if 'coco' in args.dataset_file: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) elif 'anet' == args.dataset_file: evaluate3d(model, postprocessors, data_loader_val, device, epoch)
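Every script here funnels checkpoint writes through utils.save_on_master. In the DETR-family utilities this is essentially a rank-0 guard around torch.save, reproduced here as a sketch:

import torch
import torch.distributed as dist


def is_main_process():
    # rank 0, or any process when torch.distributed is not initialized
    if not (dist.is_available() and dist.is_initialized()):
        return True
    return dist.get_rank() == 0


def save_on_master(*args, **kwargs):
    # only the main process writes, so DDP ranks don't race on the same file
    if is_main_process():
        torch.save(*args, **kwargs)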
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) output_dir = Path(args.output_dir) print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], find_unused_parameters=True) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ {"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]}, { "params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) if args.eval: dataset_val = build_dataset(image_set=args.dataset, args=args) if args.distributed: sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_val = torch.utils.data.SequentialSampler(dataset_val) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) else: dataset_train = build_dataset(image_set='train', args=args) dataset_val = build_dataset(image_set='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.SequentialSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) base_ds = get_coco_api_from_dataset(dataset_val) if args.resume and args.frozen_weights: assert False elif args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) new_state_dict = {} for k in checkpoint['model']: if ("class_embed" in k) or ("bbox_embed" in k) or ("query_embed" in k): continue if ("input_proj" in k) and args.layer1_num != 3: continue new_state_dict[k] = checkpoint['model'][k] # Compare load model and current model current_param = [n for n,p in model_without_ddp.named_parameters()] current_buffer = [n for n,p in model_without_ddp.named_buffers()] load_param = new_state_dict.keys() for p in load_param: if p not in current_param and p not in current_buffer: print(p, 'NOT appear in current model. ') for p in current_param: if p not in load_param: print(p, 'NEW parameter. 
') model_without_ddp.load_state_dict(new_state_dict, strict=False) else: checkpoint = torch.load(args.resume, map_location='cpu') # this is to compromise old implementation new_state_dict = {} for k in checkpoint['model']: if "bbox_embed" in k: print("bbox_embed from OLD implementation has been replaced with lines_embed") new_state_dict["lines_embed."+'.'.join(k.split('.')[1:])] = checkpoint['model'][k] else: new_state_dict[k] = checkpoint['model'][k] # compare resume model and current model current_param = [n for n,p in model_without_ddp.named_parameters()] current_buffer = [n for n,p in model_without_ddp.named_buffers()] load_param = new_state_dict.keys() #for p in load_param: #if p not in current_param and p not in current_buffer: #print(p, 'not been loaded to current model. Strict == False?') for p in current_param: if p not in load_param: print(p, 'is a new parameter. Not found from load dict.') # load model model_without_ddp.load_state_dict(new_state_dict) # load optimizer if not args.no_opt and not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) checkpoint['lr_scheduler']['step_size'] = args.lr_drop # change the lr_drop epoch lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 elif args.frozen_weights: checkpoint = torch.load(args.frozen_weights, map_location='cpu') new_state_dict = {} for k in checkpoint['model']: if "bbox_embed" in k: new_state_dict["lines_embed."+'.'.join(k.split('.')[1:])] = checkpoint['model'][k] else: new_state_dict[k] = checkpoint['model'][k] model_without_ddp.letr.load_state_dict(new_state_dict) # params encoder = {k:v for k,v in new_state_dict.items() if "encoder" in k} decoder = {k:v for k,v in new_state_dict.items() if "decoder" in k} class_embed = {k:v for k,v in new_state_dict.items() if "class_embed" in k} line_embed = {k:v for k,v in new_state_dict.items() if "lines_embed" in k} model_without_ddp.load_state_dict(encoder, strict=False) model_without_ddp.load_state_dict(decoder, strict=False) model_without_ddp.load_state_dict(class_embed, strict=False) model_without_ddp.load_state_dict(line_embed, strict=False) print('Finish load frozen_weights') else: print("NO RESUME. 
TRAIN FROM SCRATCH") if args.eval: test_stats = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir, args) #print('checkpoint'+ str(checkpoint['epoch'])) return print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, postprocessors, data_loader_train, optimizer, device, epoch, args.clip_max_norm, args) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / 'checkpoints/checkpoint.pth'] # extra checkpoint before LR drop and every several epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % args.save_freq == 0: checkpoint_paths.append(output_dir / f'checkpoints/checkpoint{epoch:04}.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master({ 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) test_stats = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir, args) log_stats = {**{f'train_{k}': format(v, ".6f") for k, v in train_stats.items()}, **{f'test_{k}': format(v, ".6f") for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters} if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
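The resume path above migrates legacy bbox_embed.* keys to lines_embed.* by splicing strings inline. A generalized sketch of that key migration (the helper name is illustrative, not from the repo):

def remap_state_dict(state_dict, renames):
    # replace the first dotted component of each key when it appears in `renames`
    out = {}
    for key, value in state_dict.items():
        head, _, tail = key.partition('.')
        new_head = renames.get(head, head)
        out[new_head + ('.' + tail if tail else '')] = value
    return out


# e.g. migrating the old head name handled above:
# checkpoint['model'] = remap_state_dict(checkpoint['model'],
#                                        {'bbox_embed': 'lines_embed'})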
random.seed(seed) # %% PREPARE DATA # TODO: Train on different dataset! dataset_train = CocoDetection(args.coco_path + "/train2017", args.coco_path + "/annotations/instances_train2017.json", transforms=make_coco_transforms("train"), return_masks=False) dataset_val = CocoDetection(args.coco_path + "/val2017", args.coco_path + "/annotations/instances_val2017.json", transforms=make_coco_transforms("val"), return_masks=False) base_ds = get_coco_api_from_dataset(dataset_val) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler( sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers)
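These loaders rely on utils.collate_fn to batch images of different sizes. A hedged sketch of what such a collate does, padding every image to the largest in the batch and returning a pad mask (the real helper wraps the result in a NestedTensor instead):

import torch


def collate_fn(batch):
    images, targets = list(zip(*batch))
    max_c = max(img.shape[0] for img in images)
    max_h = max(img.shape[1] for img in images)
    max_w = max(img.shape[2] for img in images)
    padded = torch.zeros(len(images), max_c, max_h, max_w)
    mask = torch.ones(len(images), max_h, max_w, dtype=torch.bool)
    for i, img in enumerate(images):
        c, h, w = img.shape
        padded[i, :c, :h, :w].copy_(img)
        mask[i, :h, :w] = False  # False where real pixels live, True on padding
    return padded, mask, targets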
def main(args): utils.init_distributed_mode(args) # distributed-training setup: environment variables decide whether distributed training is used and, if so, the related parameters are set; see the source in util/misc.py, not analyzed here. print("git:\n {}\n".format(utils.get_sha())) if args.frozen_weights is not None: # whether to freeze the weights, as in transfer-learning fine-tuning; if set, the masks flag must also be given, since frozen training applies only to segmentation assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility; get_rank() is the index of the distributed node seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) # build the model, loss criterion, and post-processors from the arguments model, criterion, postprocessors = build_model(args) model.to(device) # ddp is short for DistributedDataParallel model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module # count and print the number of trainable parameters n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) # set up the optimizer and LR schedule, and build the training and validation sets # the backbone parameters are separated from the rest so they can be trained with a different initial learning rate param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) # build_dataset(), used to construct the datasets, calls the COCO dataset API and lives in datasets/__init__.py dataset_train = build_dataset(image_set='train', args=args) dataset_val = build_dataset(image_set='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) # after building the datasets, set up their samplers and wrap them in DataLoaders for batched training; note that collate_fn is used to re-assemble each batch # collate_fn is in util/misc.py data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) else: base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: # as in transfer-learning fine-tuning: freeze the weights and train only the segmentation head checkpoint = torch.load(args.frozen_weights, map_location='cpu') model_without_ddp.detr.load_state_dict(checkpoint['model']) # resume from an earlier stage of training, restoring the model weights, optimizer, LR scheduler, and epoch from that point output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 # this flag means evaluation only, with no training if args.eval: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return # now training proper begins, epoch by epoch; after each epoch the learning rate is adjusted according to the schedule print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) # run one epoch of training # the corresponding code is train_one_epoch() in detr/engine.py, which, as the name suggests, covers everything the model does in one training epoch train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm) lr_scheduler.step() # record the training results and related parameters to the target files if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') # save the training state, including learning-rate parameters for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) # after each training epoch, evaluate on the validation set test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } # write the training and validation results to the target file on the (distributed) main node if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") # for evaluation logs if coco_evaluator is not None: (output_dir / 'eval').mkdir(exist_ok=True) if "bbox" in coco_evaluator.coco_eval: filenames = ['latest.pth'] if epoch % 50 == 0: filenames.append(f'{epoch:03}.pth') for name in filenames: # save the evaluation results torch.save(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / name) # finally, compute and print the total training time; the whole training pipeline ends here total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
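The sampler_train.set_epoch(epoch) call at the top of each epoch is what re-seeds the distributed shuffle; without it every epoch replays epoch 0's order. A standalone illustration (explicit rank and world size, so no process group is needed):

import torch
from torch.utils.data import DistributedSampler, TensorDataset

dataset = TensorDataset(torch.arange(8))
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)
for epoch in range(2):
    sampler.set_epoch(epoch)   # without this, both epochs yield the same order
    print(list(iter(sampler)))  # this rank's shard of the shuffled indices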
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) dataset_train = build_dataset(image_set='train', args=args) dataset_val = build_dataset(image_set='val', args=args) dataset_test = build_dataset(image_set='test', args=args) if args.distributed: if args.cache_mode: sampler_train = samplers.NodeDistributedSampler(dataset_train) sampler_val = samplers.NodeDistributedSampler(dataset_val, shuffle=False) # fix: the test sampler was missing in this branch, so cache_mode runs hit a NameError below sampler_test = samplers.NodeDistributedSampler(dataset_test, shuffle=False) else: sampler_train = samplers.DistributedSampler(dataset_train) sampler_val = samplers.DistributedSampler(dataset_val, shuffle=False) sampler_test = samplers.DistributedSampler(dataset_test, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) sampler_test = torch.utils.data.SequentialSampler(dataset_test) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers, pin_memory=True) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers, pin_memory=True) # fix: the test loader previously reused sampler_val, which indexes dataset_val data_loader_test = DataLoader(dataset_test, args.batch_size, sampler=sampler_test, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers, pin_memory=True) # lr_backbone_names = ["backbone.0", "backbone.neck", "input_proj", "transformer.encoder"] def match_name_keywords(n, name_keywords): out = False for b in name_keywords: if b in n: out = True break return out for n, p in model_without_ddp.named_parameters(): print(n) param_dicts = [{ "params": [ p for n, p in model_without_ddp.named_parameters() if not match_name_keywords(n, args.lr_backbone_names) and not match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad ], "lr": args.lr, }, { "params": [ p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_backbone_names) and p.requires_grad ], "lr": args.lr_backbone, }, { "params": [ p for n, p in model_without_ddp.named_parameters() if match_name_keywords(n, args.lr_linear_proj_names) and p.requires_grad ], "lr": args.lr * args.lr_linear_proj_mult, }] if args.sgd: optimizer = torch.optim.SGD(param_dicts, lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) else: optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) else: base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: checkpoint = torch.load(args.frozen_weights, 
map_location='cpu') model_without_ddp.detr.load_state_dict(checkpoint['model']) output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') # drop all seven classification heads (decoder copies and final) so they are re-initialized for the new label set for i in range(7): del checkpoint["model"][f"transformer.decoder.class_embed.{i}.weight"], checkpoint["model"][f"transformer.decoder.class_embed.{i}.bias"], checkpoint["model"][f"class_embed.{i}.weight"], checkpoint["model"][f"class_embed.{i}.bias"] missing_keys, unexpected_keys = model_without_ddp.load_state_dict( checkpoint['model'], strict=False) unexpected_keys = [ k for k in unexpected_keys if not (k.endswith('total_params') or k.endswith('total_ops')) ] # if len(missing_keys) > 0: # print('Missing Keys: {}'.format(missing_keys)) # if len(unexpected_keys) > 0: # print('Unexpected Keys: {}'.format(unexpected_keys)) # if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: # import copy # p_groups = copy.deepcopy(optimizer.param_groups) # optimizer.load_state_dict(checkpoint['optimizer']) # for pg, pg_old in zip(optimizer.param_groups, p_groups): # pg['lr'] = pg_old['lr'] # pg['initial_lr'] = pg_old['initial_lr'] # #print(optimizer.param_groups) # lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) # # todo: this is a hack for doing experiment that resume from checkpoint and also modify lr scheduler (e.g., decrease lr in advance). 
# args.override_resumed_lr_drop = True # if args.override_resumed_lr_drop: # print('Warning: (hack) args.override_resumed_lr_drop is set to True, so args.lr_drop would override lr_drop in resumed lr_scheduler.') # lr_scheduler.step_size = args.lr_drop # lr_scheduler.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups)) # lr_scheduler.step(lr_scheduler.last_epoch) # args.start_epoch = checkpoint['epoch'] + 1 # # check the resumed model if not args.eval: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) if args.eval: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return if args.test: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_test, base_ds, device, args.output_dir) if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] # extra checkpoint before LR drop and every 5 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 5 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") # for evaluation logs if coco_evaluator is not None: (output_dir / 'eval').mkdir(exist_ok=True) if "bbox" in coco_evaluator.coco_eval: filenames = ['latest.pth'] if epoch % 50 == 0: filenames.append(f'{epoch:03}.pth') for name in filenames: torch.save(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / name) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
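The resume branch above deletes the classification heads entry by entry before the strict=False load. The same head reset can be written generically; a sketch (the helper name is illustrative, not part of the repo):

def strip_matching_keys(state_dict, keywords=('class_embed',)):
    # drop every entry whose key mentions one of the keywords, so those
    # modules fall back to fresh initialization under strict=False loading
    return {k: v for k, v in state_dict.items()
            if not any(kw in k for kw in keywords)}


# checkpoint['model'] = strip_matching_keys(checkpoint['model'])
# model_without_ddp.load_state_dict(checkpoint['model'], strict=False)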
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) dataset_train = build_dataset(image_set='train', args=args) dataset_val = build_dataset(image_set='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) else: base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: checkpoint = torch.load(args.frozen_weights, map_location='cpu') model_without_ddp.detr.load_state_dict(checkpoint['model']) output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model'], strict=False) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.eval: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") return #cab writer = SummaryWriter("runs/" + args.tb_name) best_value = 0 print("Start training, best_value is " + str(best_value)) start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm) lr_scheduler.step() 
test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) #cab for k, v in train_stats.items(): if isinstance(v, float): writer.add_scalar(f'train_{k}', v, epoch) # the twelve COCO summary stats, in the order cocoeval reports them coco_stat_tags = ['Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]', 'Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ]', 'Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ]', 'Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]', 'Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]', 'Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]', 'Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ]', 'Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ]', 'Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ]', 'Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ]', 'Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ]', 'Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ]'] new_value = 0 for k, v in test_stats.items(): if isinstance(v, float): writer.add_scalar(f'test_{k}', v, epoch) if k == "coco_eval_bbox": new_value = v[0] for tag, stat in zip(coco_stat_tags, v): writer.add_scalar('Bbox ' + tag, stat, epoch) if k == "coco_eval_masks": new_value = v[0] for tag, stat in zip(coco_stat_tags, v): writer.add_scalar('Mask ' + tag, stat, epoch) print("Epoch finished, best_value is " + str(best_value)) save_pth = False if best_value < new_value: best_value = new_value save_pth = True if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pth'] # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pth') if save_pth: checkpoint_paths.append(output_dir / 'best.pth') # use a context manager so the best-model log is flushed and closed with open(output_dir / 'best_log.txt', 'w+') as bestLog: bestLog.write(f'Saved model at epoch {epoch:04}\n') 
for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) #/cab log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") # for evaluation logs if coco_evaluator is not None: (output_dir / 'eval').mkdir(exist_ok=True) if "bbox" in coco_evaluator.coco_eval: filenames = ['latest.pth'] if epoch % 50 == 0: filenames.append(f'{epoch:03}.pth') for name in filenames: torch.save(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / name) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
def main(args): utils.init_distributed_mode(args) if args.output_dir is None: args.output_dir = os.path.expanduser( '~/Data/AI2Thor_detection_features/') if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop) dataset_all = build_dataset(image_set='all', args=args) if args.distributed: sampler_all = DistributedSampler(dataset_all, shuffle=False) else: sampler_all = torch.utils.data.SequentialSampler(dataset_all) data_loader_all = DataLoader(dataset_all, args.batch_size, sampler=sampler_all, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) base_ds = get_coco_api_from_dataset(dataset_all) output_dir = Path(args.output_dir) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model'], strict=False) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 print("Start extracting features") start_time = time.time() extract_feature(model, criterion, postprocessors, data_loader_all, base_ds, device, args.output_dir) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Extracting features time {}'.format(total_time_str)) print('Start combining files') start_time = time.time() data_dir = os.path.expanduser('~/Data/AI2Thor_offline_data_2.0.2/') combine_files(args, data_dir) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Combining files time {}'.format(total_time_str))
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] dataset_val = build_dataset(image_set='val', args=args) sampler_val = torch.utils.data.SequentialSampler(dataset_val) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) base_ds = get_coco_api_from_dataset(dataset_val) checkpoint = torch.load(args.resume) model_without_ddp.load_state_dict(checkpoint['model']) model_without_ddp.eval() model_without_ddp.to(device) header = 'Test:' for samples, targets in data_loader_val: samples = samples.to(device) targets = [{k: v.to(device) for k, v in t.items()} for t in targets] outputs = model_without_ddp(samples) probas = outputs['pred_logits'].softmax(-1)[0, :, :-1] keep = probas.max(-1).values > 0.7 bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep]) print(probas[keep], bboxes_scaled)
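The snippet above calls rescale_bboxes with only the boxes, so its local version presumably binds the target image size internally. For reference, the DETR notebook helper it derives from takes the size explicitly:

import torch


def box_cxcywh_to_xyxy(x):
    # convert (center_x, center_y, w, h) boxes to (x0, y0, x1, y1)
    x_c, y_c, w, h = x.unbind(-1)
    return torch.stack([x_c - 0.5 * w, y_c - 0.5 * h,
                        x_c + 0.5 * w, y_c + 0.5 * h], dim=-1)


def rescale_bboxes(out_bbox, size):
    # scale normalized DETR boxes back to absolute pixel coordinates
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    return b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)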
def main(args): utils.init_distributed_mode(args) print("git:\n {}\n".format(utils.get_sha())) if args.frozen_weights is not None: assert args.masks, "Frozen training is meant for segmentation only" print(args) device = torch.device(args.device) # fix the seed for reproducibility seed = args.seed + utils.get_rank() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.gpu]) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [ { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad ] }, { "params": [ p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad ], "lr": args.lr_backbone, }, ] optimizer = torch.optim.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop, gamma=0.9) dataset_train = build_dataset(image_set='train', args=args) dataset_val = build_dataset(image_set='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = torch.utils.data.RandomSampler(dataset_train) sampler_val = torch.utils.data.SequentialSampler(dataset_val) batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) if args.dataset_file == "coco_panoptic": # We also evaluate AP during panoptic training, on original coco DS coco_val = datasets.coco.build("val", args) base_ds = get_coco_api_from_dataset(coco_val) else: base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: checkpoint = torch.load(args.frozen_weights, map_location='cpu') model_without_ddp.detr.load_state_dict(checkpoint['model']) output_dir = Path(args.output_dir) output_dir = output_dir / f"{args.backbone}_{args.transformer_type}" if args.output_dir: output_dir.mkdir(parents=True, exist_ok=True) if args.resume: if args.resume.startswith('https'): checkpoint = torch.hub.load_state_dict_from_url(args.resume, map_location='cpu', check_hash=True) else: checkpoint = torch.load(args.resume, map_location='cpu') model_without_ddp.load_state_dict(checkpoint['model']) if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint: optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.eval: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval.pth") print("Start training") start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, 
args.clip_max_norm) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / f'checkpoint_{epoch}.pth'] # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch}_extra.pth') for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args, }, checkpoint_path) test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } if args.output_dir and utils.is_main_process(): with (output_dir / "log.txt").open("a") as f: f.write(json.dumps(log_stats) + "\n") # for evaluation logs if coco_evaluator is not None: (output_dir / 'eval').mkdir(exist_ok=True) if "bbox" in coco_evaluator.coco_eval: filenames = ['latest.pth'] if epoch % 50 == 0: filenames.append(f'{epoch:03}.pth') for name in filenames: torch.save(coco_evaluator.coco_eval["bbox"].eval, output_dir / "eval" / name) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
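Most of the mains above build the same two optimizer groups by scanning parameter names for "backbone". The pattern, factored out as a sketch (the function name is illustrative):

import torch


def build_param_dicts(model, lr_backbone):
    # split trainable parameters into backbone vs. everything else
    backbone, rest = [], []
    for name, p in model.named_parameters():
        if not p.requires_grad:
            continue
        (backbone if 'backbone' in name else rest).append(p)
    return [{'params': rest},                      # uses the optimizer default lr
            {'params': backbone, 'lr': lr_backbone}]


# optimizer = torch.optim.AdamW(build_param_dicts(model, args.lr_backbone),
#                               lr=args.lr, weight_decay=args.weight_decay)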
def main(args): utils.init_distributed_mode(args) print('git:\n {}\n'.format(utils.get_sha())) if args.frozen_weights is not None: assert args.masks, 'Frozen training is meant for segmentation only' print(args) device = args.device device = device.replace('cuda', 'gpu') device = paddle.set_device(device) seed = args.seed + utils.get_rank() paddle.seed(seed) np.random.seed(seed) random.seed(seed) model, criterion, postprocessors = build_model(args) model.to(device) model_without_ddp = model if args.distributed: model = paddle.DataParallel(model) model_without_ddp = model.module n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) print('number of params:', n_parameters) param_dicts = [{ 'params': [ p for n, p in model_without_ddp.named_parameters() if 'backbone' not in n and p.requires_grad ] }, { 'params': [ p for n, p in model_without_ddp.named_parameters() if 'backbone' in n and p.requires_grad ], 'lr': args.lr_backbone }] optimizer = torch2paddle.AdamW(param_dicts, lr=args.lr, weight_decay=args.weight_decay) # use the configured learning rate here; the converted code hard-coded 0.01, which silently overrode args.lr lr_scheduler = paddle.optimizer.lr.StepDecay(step_size=args.lr_drop, learning_rate=args.lr) optimizer._learning_rate = lr_scheduler dataset_train = build_dataset(image_set='train', args=args) dataset_val = build_dataset(image_set='val', args=args) if args.distributed: sampler_train = DistributedSampler(dataset_train) sampler_val = DistributedSampler(dataset_val, shuffle=False) else: sampler_train = paddle.io.RandomSampler(dataset_train) sampler_val = paddle.io.SequenceSampler(dataset_val) # pass the sampler by keyword so it is not mistaken for BatchSampler's dataset argument batch_sampler_train = paddle.io.BatchSampler(sampler=sampler_train, batch_size=args.batch_size, drop_last=True) data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train, collate_fn=utils.collate_fn, num_workers=args.num_workers) data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val, drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers) if args.dataset_file == 'coco_panoptic': coco_val = datasets.coco.build('val', args) base_ds = get_coco_api_from_dataset(coco_val) else: base_ds = get_coco_api_from_dataset(dataset_val) if args.frozen_weights is not None: checkpoint = paddle.load(args.frozen_weights) model_without_ddp.detr.load_state_dict(checkpoint['model']) output_dir = Path(args.output_dir) if args.resume: # both the URL and local-path branches called paddle.load identically, so the branch is collapsed; download URL checkpoints beforehand checkpoint = paddle.load(args.resume) model_without_ddp.load_state_dict(checkpoint['model']) if (not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint): optimizer.load_state_dict(checkpoint['optimizer']) lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) args.start_epoch = checkpoint['epoch'] + 1 if args.eval: test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) if args.output_dir: utils.save_on_master(coco_evaluator.coco_eval['bbox'].eval, output_dir / 'eval.pdiparams') return print('Start training') start_time = time.time() for epoch in range(args.start_epoch, args.epochs): if args.distributed: sampler_train.set_epoch(epoch) train_stats = train_one_epoch(model, criterion, data_loader_train, optimizer, device, epoch, args.clip_max_norm) lr_scheduler.step() if args.output_dir: checkpoint_paths = [output_dir / 'checkpoint.pdiparams'] if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 100 == 0: checkpoint_paths.append(output_dir / f'checkpoint{epoch:04}.pdiparams') for checkpoint_path in checkpoint_paths: utils.save_on_master( { 'model': model_without_ddp.state_dict(), 'optimizer': optimizer.state_dict(), 'lr_scheduler': lr_scheduler.state_dict(), 'epoch': epoch, 'args': args }, checkpoint_path) test_stats, coco_evaluator = evaluate(model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir) log_stats = { **{f'train_{k}': v for k, v in train_stats.items()}, **{f'test_{k}': v for k, v in test_stats.items()}, 'epoch': epoch, 'n_parameters': n_parameters } if args.output_dir and utils.is_main_process(): with (output_dir / 'log.txt').open('a') as f: f.write(json.dumps(log_stats) + '\n') if coco_evaluator is not None: (output_dir / 'eval').mkdir(exist_ok=True) if 'bbox' in coco_evaluator.coco_eval: filenames = ['latest.pdiparams'] if epoch % 50 == 0: filenames.append(f'{epoch:03}.pdiparams') for name in filenames: # paddle.save expects a string path paddle.save(coco_evaluator.coco_eval['bbox'].eval, str(output_dir / 'eval' / name)) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) print('Training time {}'.format(total_time_str))
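The Paddle port above attaches its scheduler through the private optimizer._learning_rate attribute. Idiomatic Paddle passes the scheduler as the optimizer's learning_rate instead; a sketch, with a toy layer standing in for the detector:

import paddle

# toy layer so the sketch is self-contained
model = paddle.nn.Linear(10, 10)

scheduler = paddle.optimizer.lr.StepDecay(learning_rate=1e-4, step_size=200, gamma=0.1)
optimizer = paddle.optimizer.AdamW(learning_rate=scheduler,
                                   parameters=model.parameters(),
                                   weight_decay=1e-4)
# ...then, as in the training loop above, call once per epoch:
scheduler.step()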