def train(train_loader, model, scheduler, optimizer, epoch, args):
    global iteration
    print("{} epoch: \t start training....".format(epoch))
    start = time.time()
    total_loss = []
    model.train()
    model.module.is_training = True
    model.module.freeze_bn()
    optimizer.zero_grad()
    for idx, (images, annotations) in enumerate(train_loader):
        images = images.cuda().float()
        annotations = annotations.cuda()
        classification_loss, regression_loss = model([images, annotations])
        classification_loss = classification_loss.mean()
        regression_loss = regression_loss.mean()
        loss = classification_loss + regression_loss
        if loss == 0:
            print('loss equals zero, skipping batch')
            continue
        loss.backward()
        # Step the optimizer only every `grad_accumulation_steps` batches.
        if (idx + 1) % args.grad_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            optimizer.step()
            optimizer.zero_grad()
        total_loss.append(loss.item())
        # Log and checkpoint every 1000 iterations.
        if iteration % 1000 == 0:
            print('{} iteration: training ...'.format(iteration))
            ans = {
                'epoch': epoch,
                'iteration': iteration,
                'cls_loss': classification_loss.item(),
                'reg_loss': regression_loss.item(),
                'mean_loss': np.mean(total_loss)
            }
            for key, value in ans.items():
                print('    {:15s}: {}'.format(str(key), value))
            state = {
                'epoch': epoch,
                'parser': args,
                'state_dict': get_state_dict(model)
            }
            torch.save(
                state,
                os.path.join(args.save_folder, args.dataset, args.network,
                             "checkpoint_{}_{}.pth".format(epoch, iteration)))
        iteration += 1
    scheduler.step(np.mean(total_loss))
    result = {'time': time.time() - start, 'loss': np.mean(total_loss)}
    for key, value in result.items():
        print('    {:15s}: {}'.format(str(key), value))
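# NOTE: train() above steps the optimizer every `grad_accumulation_steps`
# batches but backpropagates the unscaled loss, so the accumulated gradient is
# roughly `grad_accumulation_steps` times larger than one large-batch gradient.
# A minimal sketch of the usual scaling; the function name and default step
# count are illustrative, not part of the original code.
import torch

def train_with_scaled_accumulation(train_loader, model, optimizer,
                                   accumulation_steps=4):
    optimizer.zero_grad()
    for idx, (images, annotations) in enumerate(train_loader):
        classification_loss, regression_loss = model(
            [images.cuda().float(), annotations.cuda()])
        loss = classification_loss.mean() + regression_loss.mean()
        # Divide so the summed gradient matches one large-batch update.
        (loss / accumulation_steps).backward()
        if (idx + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
            optimizer.step()
            optimizer.zero_grad()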
def load_pretrained_model(self, pretrained_model_file=None, skip=()):
    """Copy matching weights from a checkpoint file, or fall back to the model zoo."""
    if pretrained_model_file:
        pretrain_state_dict = get_state_dict(pretrained_model_file)
        state_dict = self.state_dict()
        for key in list(state_dict.keys()):
            # Leave layers whose names match a `skip` pattern untouched.
            if any(s in key for s in skip):
                continue
            try:
                state_dict[key] = pretrain_state_dict[key]
            except KeyError:
                print("KeyError: {} doesn't exist in the pretrained state dict".format(key))
    else:
        state_dict = model_zoo.load_url(model_urls[self.name])
    self.load_state_dict(state_dict)
def load_pretrained_model(self, pretrained_model_file=None, skip=()):
    """Variant that falls back to the Inception-v3 weights from the PyTorch model zoo."""
    if pretrained_model_file:
        pretrain_state_dict = get_state_dict(pretrained_model_file)
        state_dict = self.state_dict()
        for key in list(state_dict.keys()):
            # Leave layers whose names match a `skip` pattern untouched.
            if any(s in key for s in skip):
                continue
            try:
                state_dict[key] = pretrain_state_dict[key]
            except KeyError:
                print("KeyError: {} doesn't exist in the pretrained state dict".format(key))
    else:
        state_dict = model_zoo.load_url(
            'https://download.pytorch.org/models/inception_v3_google-1a9a5a14.pth')
    self.load_state_dict(state_dict)
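# ASSUMPTION: `get_state_dict` is called throughout this section but never
# defined in it. The sketch below is a guess from its call sites,
# get_state_dict(model) and get_state_dict(path, device=...): accept either a
# live model or a checkpoint path, and strip the "module." prefix that
# torch.nn.DataParallel adds to parameter names.
import torch

def get_state_dict(model_or_path, device='cpu'):
    if isinstance(model_or_path, str):
        state_dict = torch.load(model_or_path, map_location=device)
        # Checkpoints saved by train() wrap the weights under 'state_dict'.
        if isinstance(state_dict, dict) and 'state_dict' in state_dict:
            state_dict = state_dict['state_dict']
    else:
        state_dict = model_or_path.state_dict()
    # Strip the "module." prefix so weights load into an unwrapped model.
    return {key[len('module.'):] if key.startswith('module.') else key: value
            for key, value in state_dict.items()}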
def main(cfg, state, plot=False):
    # Dataloaders
    dataset = LeddartechDataset(cfg, use_test_set=True)
    test_loader = DataLoader(dataset,
                             batch_size=cfg['TRAINING']['BATCH_SIZE'],
                             num_workers=cfg['TRAINING']['NUM_WORKERS'])
    print(f"Dataset size: {len(dataset)}")

    # Model
    in_channels = dataset.check_number_channels()
    model = getattr(models, cfg['NEURAL_NET']['NAME'])(cfg, in_channels)
    print(f"Model size: {model.size_of_net}")
    if cfg['TRAINING']['DEVICE'] == 'cuda' and torch.cuda.device_count() > 1:
        # Multiple GPUs
        model = torch.nn.DataParallel(model)
    model.to(cfg['TRAINING']['DEVICE'])
    print(f"Device set to: {cfg['TRAINING']['DEVICE']}")

    # Load model state
    state_dict = get_state_dict(state, device=cfg['TRAINING']['DEVICE'])
    model.load_state_dict(state_dict)
    model.eval()

    # Evaluator engine
    eval_metrics = {}
    for metric in cfg['TRAINING']['METRICS']:
        eval_metrics[metric] = getattr(metrics, metric)(
            cfg, **cfg['TRAINING']['METRICS'][metric])
    evaluator = create_supervised_evaluator(model,
                                            metrics=eval_metrics,
                                            device=cfg['TRAINING']['DEVICE'])
    pbar2 = tqdm_logger.ProgressBar(persist=True, desc='Testing')
    pbar2.attach(evaluator)

    # Start testing
    evaluator.run(test_loader)
    print('Test results: ', evaluator.state.metrics)
    if plot:
        for metric in cfg['TRAINING']['METRICS']:
            if hasattr(eval_metrics[metric], 'make_plot'):
                eval_metrics[metric].make_plot(evaluator.state.metrics)
    return evaluator.state.metrics
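# Hypothetical invocation of the evaluation entry point above; the config file
# and checkpoint path are illustrative only, not taken from the original repo.
import yaml

if __name__ == '__main__':
    with open('configs/leddartech.yml') as f:
        cfg = yaml.safe_load(f)
    test_metrics = main(cfg, state='checkpoints/best_model.pth', plot=True)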
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes.
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # Optionally resume from a checkpoint.
    checkpoint = None
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map the model to the specified single GPU.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            params = checkpoint['parser']
            args.num_class = params.num_class
            args.network = params.network
            args.start_epoch = params.start_epoch + 1
            del params

    model = EfficientDet(num_classes=args.num_class,
                         network=args.network,
                         W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
                         D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
                         D_class=EFFICIENTDET[args.network]['D_class'],
                         gpu=args.gpu)
    if checkpoint is not None:
        model.load_state_dict(checkpoint['state_dict'])
        del checkpoint

    if args.distributed:
        # For multiprocessing distributed, the DistributedDataParallel
        # constructor should always set the single device scope; otherwise
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu], find_unused_parameters=True)
            print('Run with DistributedDataParallel with device_ids....')
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to
            # all available GPUs if device_ids are not set.
            model = torch.nn.parallel.DistributedDataParallel(model)
            print('Run with DistributedDataParallel without device_ids....')
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        print('Run with DataParallel ....')
        model = torch.nn.DataParallel(model).cuda()

    # Training dataset
    if args.dataset == 'VOC':
        train_dataset = VOCDetection(root=args.dataset_root,
                                     transform=transforms.Compose(
                                         [Normalizer(),
                                          Augmenter(),
                                          Resizer()]))
    elif args.dataset == 'COCO':
        train_dataset = CocoDataset(
            root_dir=args.dataset_root,
            set_name='train2017',
            transform=get_augumentation(
                phase='train',
                width=EFFICIENTDET[args.network]['input_size'],
                height=EFFICIENTDET[args.network]['input_size']))
    else:
        raise ValueError('Unsupported dataset: {}'.format(args.dataset))

    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.workers,
                              shuffle=True,
                              collate_fn=collater,
                              pin_memory=True)

    # Define the optimizer and learning-rate scheduler.
    optimizer = optim.AdamW(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     patience=3,
                                                     verbose=True)
    cudnn.benchmark = True

    for epoch in range(args.start_epoch, args.num_epoch):
        train(train_loader, model, scheduler, optimizer, epoch, args)
        # Save a checkpoint at the end of every epoch.
        state = {
            'epoch': epoch,
            'parser': args,
            'state_dict': get_state_dict(model)
        }
        torch.save(
            state,
            './weights/checkpoint_{}_{}_{}.pth'.format(
                args.dataset, args.network, epoch))
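# NOTE: the checkpoints above store only the model weights and the parsed args,
# so a resumed run restarts AdamW and ReduceLROnPlateau from scratch. A sketch
# of a fuller save/restore pair; the function names and file path are
# illustrative, not part of the original code.
def save_full_checkpoint(model, optimizer, scheduler, epoch, args,
                         path='./weights/checkpoint_full.pth'):
    torch.save({
        'epoch': epoch,
        'parser': args,
        'state_dict': get_state_dict(model),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
    }, path)

def load_full_checkpoint(optimizer, scheduler,
                         path='./weights/checkpoint_full.pth'):
    # Restore optimizer and scheduler state in place; the caller loads the
    # model weights from the returned checkpoint dict.
    checkpoint = torch.load(path)
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    return checkpoint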