def train(hyp):
    epochs = opt.epochs  # 300
    batch_size = opt.batch_size  # 64
    # weights = opt.weights  # initial training weights
    random.seed(42)
    np.random.seed(42)
    torch_utils.init_seeds(42)

    # Configure
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # model dict
    # train_path = data_dict['train']
    # test_path = data_dict['val']
    nc = 1 if opt.single_cls else int(data_dict['nc'])  # number of classes

    # Remove previous results
    for f in glob.glob('*_batch*.jpg') + glob.glob(results_file):
        os.remove(f)

    # Create model from the config above
    config = get_efficientdet_config('tf_efficientdet_d4')
    load_from_pretrained = True
    if load_from_pretrained:
        model = EfficientDet(config, pretrained_backbone=False)
        # Load the pretrained checkpoint
        checkpoint = torch.load(r'./tf_efficientdet_d4-5b370b7a.pth', map_location=device)
        try:
            exclude = ['running_mean', 'running_var']  # e.g. ['anchor', 'bn', 'tracked']
            checkpoint = {
                k: v
                for k, v in checkpoint.items()
                if k in model.state_dict() and not any(x in k for x in exclude)
                and model.state_dict()[k].shape == v.shape
            }
            model.load_state_dict(checkpoint, strict=False)
            print('Transferred %g/%g items from pretrained checkpoint' %
                  (len(checkpoint), len(model.state_dict())))
        except KeyError as e:
            s = "The pretrained checkpoint is not compatible with this model. This may be due to model " \
                "differences, or the checkpoint may be out of date. Please delete or update it, " \
                "or use --weights '' to train from scratch."
            raise KeyError(s) from e
        config.num_classes = 1
        config.image_size = opt.img_size[0]
        model.class_net = HeadNet(config, num_outputs=config.num_classes,
                                  norm_kwargs=dict(eps=.001, momentum=.01))
    else:
        # Load from best/last checkpoint
        config.num_classes = 1
        config.image_size = opt.img_size[0]
        model = EfficientDet(config, pretrained_backbone=False)
        checkpoint = torch.load(r'./weights/last.pt', map_location=device)
        # model.load_state_dict(checkpoint['model'].model.state_dict())
        print("load from last.pt\n")

    config.loss_type = opt.loss_type
    model = DetBenchTrain(model, config)
    print("effDet config:", config)
    imgsz, imgsz_test = [x for x in opt.img_size]  # verify imgsz are gs-multiples

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / batch_size), 1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= batch_size * accumulate / nbs  # scale weight_decay
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        if v.requires_grad:
            if '.bias' in k:
                pg0.append(v)  # biases
            elif ('.weight' in k or '.edge_weights' in k) and '.bn' not in k:
                pg1.append(v)  # apply weight decay
            else:
                pg0.append(v)  # all else

    optimizer = optim.Adam(pg0, lr=hyp['lr0']) if opt.adam else \
        optim.RMSprop(pg0, lr=hyp['lr0'])
    optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']})  # add pg1 with weight_decay
    lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.9 + 0.1  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    # note: biases were appended to pg0 above, so pg2 stays empty and is counted under "other"
    print('Optimizer groups: %g .bias, %g conv.weight, %g other' %
          (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Load Model
    start_epoch, best_fitness = 0, 1000.0  # best_fitness tracks the lowest validation summary loss
    if load_from_pretrained == False:
        if checkpoint['optimizer_state_dict'] is not None:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            best_fitness = checkpoint['best_summary_loss']
            print("load best loss:", best_fitness)
        if checkpoint['epoch'] is not None:
            start_epoch = checkpoint['epoch'] + 1
            if epochs < start_epoch:
                print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.'
                      % (opt.weights, checkpoint['epoch'], epochs))
                epochs += checkpoint['epoch']  # finetune additional epochs
        del checkpoint

    # Mixed precision training https://github.com/NVIDIA/apex
    model.to(device)
    if mixed_precision:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

    scheduler.last_epoch = start_epoch - 1  # do not move

    # Initialize distributed training
    distribution = False

    # Trainloader
    dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=RandomSampler(train_dataset),
        pin_memory=True,
        drop_last=True,
        num_workers=4,
        collate_fn=collate_fn,
    )

    # Testloader
    testloader = torch.utils.data.DataLoader(
        validation_dataset,
        batch_size=batch_size,
        num_workers=3,
        shuffle=False,
        sampler=SequentialSampler(validation_dataset),
        pin_memory=True,
        collate_fn=collate_fn,
    )

    # Exponential moving average
    ema = torch_utils.ModelEMA(model)

    # Start training
    t0 = time.time()
    nb = len(dataloader)  # number of batches
    n_burn = max(2 * nb, 1e3)  # burn-in iterations, max(2 epochs, 1k iterations)
    maps = np.zeros(nc)  # mAP per class
    results = (0, 0, 0, 0, 0, 0, 0)  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    print('Image sizes %g train, %g test' % (imgsz, imgsz_test))
    print('Using %g dataloader workers' % dataloader.num_workers)
    print('Starting training for %g epochs...' % epochs)
    # torch.autograd.set_detect_anomaly(True)

    for epoch in range(start_epoch, epochs):  # epoch ---------------------------------------------------------
        model.train()
        mloss = torch.zeros(3, device='cpu')  # mean losses
        print(('\n' + '%10s' * 7) %
              ('Epoch', 'gpu_mem', 'box', 'cls', 'total', 'targets', 'img_size'))
        pbar = tqdm(enumerate(dataloader), ncols=180, total=nb)  # progress bar
        for i, (images, targets, image_ids) in pbar:  # batch --------------------------------------------------
            ni = i + nb * epoch  # number of integrated batches (since train start)
            boxes = [target['boxes'].to(device).float() for target in targets]  # yxyx?
            labels = [target['labels'].to(device).float() for target in targets]
            images = torch.stack(images, 0)
            images = images.to(device)
            batch_size = images.shape[0]

            # Burn-in
            if ni <= n_burn:
                xi = [0, n_burn]  # x interp
                accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    # (with the grouping above there is no group 2, so the 0.1 warm start never applies)
                    x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']])

            # Multi-scale (gs, the grid size, must be defined elsewhere for this branch to work)
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5, imgsz * 1.0 + gs) // gs * gs  # size
                sf = sz / max(images.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in images.shape[2:]]  # new shape (gs-multiple)
                    images = F.interpolate(images, size=ns, mode='bilinear', align_corners=False)

            total_loss, cls_loss, box_loss = model(images, boxes, labels)
            total_loss = torch.mean(total_loss)
            cls_loss = torch.mean(cls_loss)
            box_loss = torch.mean(box_loss)
            if not torch.isfinite(total_loss):
                print('WARNING: non-finite loss, ending training ', cls_loss, box_loss)
                return results

            # Backward
            if mixed_precision:
                with amp.scale_loss(total_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                total_loss.backward()

            # Optimize
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                ema.update(model)

            # Print
            mloss = (mloss * i + torch.tensor([box_loss * 50.0, cls_loss, total_loss]).detach()) / (i + 1)  # mean losses
            mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
            s = ('%10s' * 2 + '%10.4g' * 5) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss,
                                               boxes[0].shape[0], images.shape[-1])
            pbar.set_description(s)
            if ni < 3:
                f = 'train_batch%g.jpg' % ni  # filename
                result = plot_images(images=images, targets=boxes, fname=f)
            # end batch ------------------------------------------------------------------------------------

        # Scheduler
        scheduler.step()

        # mAP
        final_epoch = epoch + 1 == epochs
        if not opt.notest or final_epoch:  # Calculate mAP
            result = validation(model=ema.ema, val_loader=testloader, config=config, device=device)
            print("val:", result.avg)

        # Write
        with open(results_file, 'a') as f:
            f.write(f'[RESULT]: Train loss: {total_loss:.5f} Val. Epoch: {epoch}, '
                    f'summary_loss: {result.avg:.5f}\n')

        # Update best (lower summary loss is better)
        fi = result.avg
        if fi < best_fitness:
            best_fitness = fi
            print("best_fit\n")

        # Save model
        save = (not opt.nosave) or (final_epoch and not opt.evolve)
        if save:
            ckpt = {
                'model': ema.ema,
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_summary_loss': best_fitness,
                'epoch': epoch,
            }
            # Save last, best and delete
            torch.save(ckpt, last)
            if (best_fitness == fi) and not final_epoch:
                torch.save(ckpt, best)
            del ckpt
        # end epoch ----------------------------------------------------------------------------------------
    # end training

    # Strip optimizers
    n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name
    fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
    for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]):
        if os.path.exists(f1):
            os.rename(f1, f2)  # rename
            ispt = f2.endswith('.pt')  # is *.pt
            strip_optimizer(f2) if ispt else None  # strip optimizer
            os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None  # upload

    # Finish
    print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
    dist.destroy_process_group() if distribution and device.type != 'cpu' and torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
    return results
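# Both DataLoaders in train() pass a collate_fn that is defined elsewhere in the repository.
# A minimal sketch of what it is assumed to do (hypothetical, not the source's exact code):
# keep images, per-image target dicts, and image ids as tuples instead of stacking them,
# since each image has a different number of boxes.
def collate_fn(batch):
    # batch is a list of (image, target_dict, image_id) triples from the dataset
    return tuple(zip(*batch))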
def validate(args):
    # might as well try to validate something
    args.pretrained = args.pretrained or not args.checkpoint
    args.prefetcher = not args.no_prefetcher
    args.redundant_bias = not args.no_redundant_bias

    # create model
    config = get_efficientdet_config(args.model)
    config.redundant_bias = args.redundant_bias
    model = EfficientDet(config)
    if args.checkpoint:
        load_checkpoint(model, args.checkpoint)

    param_count = sum([m.numel() for m in model.parameters()])
    print('Model %s created, param count: %d' % (args.model, param_count))

    bench = DetBenchEval(model, config)
    bench = bench.cuda()
    if has_amp:
        print('Using AMP mixed precision.')
        bench = amp.initialize(bench, opt_level='O1')
    else:
        print('AMP not installed, running network in FP32.')
    if args.num_gpu > 1:
        bench = torch.nn.DataParallel(bench, device_ids=list(range(args.num_gpu)))

    if 'test' in args.anno:
        annotation_path = os.path.join(args.data, 'annotations', f'image_info_{args.anno}.json')
        image_dir = 'test2017'
    else:
        annotation_path = os.path.join(args.data, 'annotations', f'instances_{args.anno}.json')
        image_dir = args.anno
    dataset = CocoDetection(os.path.join(args.data, image_dir), annotation_path)

    loader = create_loader(dataset,
                           input_size=config.image_size,
                           batch_size=args.batch_size,
                           use_prefetcher=args.prefetcher,
                           interpolation=args.interpolation,
                           fill_color=args.fill_color,
                           num_workers=args.workers,
                           pin_mem=args.pin_mem)

    img_ids = []
    results = []
    model.eval()
    batch_time = AverageMeter()
    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            output = bench(input, target['scale'])
            output = output.cpu()
            sample_ids = target['img_id'].cpu()
            for index, sample in enumerate(output):
                image_id = int(sample_ids[index])
                for det in sample:
                    score = float(det[4])
                    if score < .001:  # stop when below this threshold, scores in descending order
                        break
                    coco_det = dict(image_id=image_id,
                                    bbox=det[0:4].tolist(),
                                    score=score,
                                    category_id=int(det[5]))
                    img_ids.append(image_id)
                    results.append(coco_det)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.log_freq == 0:
                print(
                    'Test: [{0:>4d}/{1}] '
                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
                    .format(i, len(loader), batch_time=batch_time,
                            rate_avg=input.size(0) / batch_time.avg))

    json.dump(results, open(args.results, 'w'), indent=4)
    if 'test' not in args.anno:
        coco_results = dataset.coco.loadRes(args.results)
        coco_eval = COCOeval(dataset.coco, coco_results, 'bbox')
        coco_eval.params.imgIds = img_ids  # score only ids we've used
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()

    return results
def main():
    setup_default_logging()
    args, args_text = _parse_args()

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    assert args.rank >= 0

    if args.distributed:
        logging.info('Training in distributed mode with multiple processes, 1 GPU per process. '
                     'Process %d, total %d.' % (args.rank, args.world_size))
    else:
        logging.info('Training with a single process on 1 GPU.')

    torch.manual_seed(args.seed + args.rank)

    # create model
    config = get_efficientdet_config(args.model)
    config.redundant_bias = args.redundant_bias  # redundant conv + BN bias layers (True to match official models)
    model = EfficientDet(config)
    model = DetBenchTrain(model, config)
    # FIXME: create a model factory with a pretrained zoo (create_model(...)), as in timm

    if args.local_rank == 0:
        logging.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel() for m in model.parameters()])))

    model.cuda()
    optimizer = create_optimizer(args, model)

    use_amp = False
    if has_apex and args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        use_amp = True
    if args.local_rank == 0:
        logging.info('NVIDIA APEX {}. AMP {}.'.format(
            'installed' if has_apex else 'not installed',
            'on' if use_amp else 'off'))

    # optionally resume from a checkpoint
    resume_state = {}
    resume_epoch = None
    if args.resume:
        resume_state, resume_epoch = resume_checkpoint(_unwrap_bench(model), args.resume)
    if resume_state and not args.no_resume_opt:
        if 'optimizer' in resume_state:
            if args.local_rank == 0:
                logging.info('Restoring Optimizer state from checkpoint')
            optimizer.load_state_dict(resume_state['optimizer'])
        if use_amp and 'amp' in resume_state and 'load_state_dict' in amp.__dict__:
            if args.local_rank == 0:
                logging.info('Restoring NVIDIA AMP state from checkpoint')
            amp.load_state_dict(resume_state['amp'])
    del resume_state

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '')
        # FIXME: bit of a mess with the bench wrapper
        if args.resume:
            load_checkpoint(_unwrap_bench(model_ema), args.resume, use_ema=True)

    if args.distributed:
        if args.sync_bn:
            try:
                if has_apex:
                    model = convert_syncbn_model(model)
                else:
                    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
                if args.local_rank == 0:
                    logging.info('Converted model to use Synchronized BatchNorm. WARNING: You may have issues '
                                 'if using zero initialized BN layers (enabled by default for ResNets) '
                                 'while sync-bn is enabled.')
            except Exception as e:
                logging.error('Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1')
        if has_apex:
            model = DDP(model, delay_allreduce=True)
        else:
            if args.local_rank == 0:
                logging.info('Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.')
            model = DDP(model, device_ids=[args.local_rank])  # can use device str in Torch >= 1.1
        # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        logging.info('Scheduled epochs: {}'.format(num_epochs))

    train_anno_set = 'train2017'
    train_annotation_path = os.path.join(args.data, 'annotations', f'instances_{train_anno_set}.json')
    train_image_dir = train_anno_set
    dataset_train = CocoDetection(os.path.join(args.data, train_image_dir), train_annotation_path)

    # FIXME: cutmix/mixup worth investigating?
    # collate_fn = None
    # if args.prefetcher and args.mixup > 0:
    #     collate_fn = FastCollateMixup(args.mixup, args.smoothing, args.num_classes)

    loader_train = create_loader(
        dataset_train,
        input_size=config.image_size,
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        # re_prob=args.reprob,  # FIXME add back various augmentations
        interpolation=args.train_interpolation,
        num_workers=args.workers,
        distributed=args.distributed,
        pin_mem=args.pin_mem,
    )

    eval_anno_set = 'val2017'
    eval_annotation_path = os.path.join(args.data, 'annotations', f'instances_{eval_anno_set}.json')
    eval_image_dir = eval_anno_set
    dataset_eval = CocoDetection(os.path.join(args.data, eval_image_dir), eval_annotation_path)

    loader_eval = create_loader(
        dataset_eval,
        input_size=config.image_size,
        batch_size=args.validation_batch_size_multiplier * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=args.interpolation,
        num_workers=args.workers,
        pin_mem=args.pin_mem,
    )

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([datetime.now().strftime("%Y%m%d-%H%M%S"), args.model])
        output_dir = get_outdir(output_base, 'train', exp_name)
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(checkpoint_dir=output_dir, decreasing=decreasing)
        with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
            f.write(args_text)

    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_epoch(epoch, model, loader_train, optimizer, args,
                                        lr_scheduler=lr_scheduler, saver=saver,
                                        output_dir=output_dir, use_amp=use_amp,
                                        model_ema=model_ema)

            if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                if args.local_rank == 0:
                    logging.info("Distributing BatchNorm running means and vars")
                distribute_bn(model, args.world_size, args.dist_bn == 'reduce')

            eval_metrics = validate(model, loader_eval, args)

            if model_ema is not None and not args.model_ema_force_cpu:
                if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                    distribute_bn(model_ema, args.world_size, args.dist_bn == 'reduce')
                ema_eval_metrics = validate(model_ema.ema, loader_eval, args, log_suffix=' (EMA)')
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch, train_metrics, eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                best_metric, best_epoch = saver.save_checkpoint(
                    _unwrap_bench(model), optimizer, args,
                    epoch=epoch,
                    model_ema=_unwrap_bench(model_ema),
                    metric=save_metric,
                    use_amp=use_amp)

    except KeyboardInterrupt:
        pass

    if best_metric is not None:
        logging.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
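# _unwrap_bench() is used above when resuming and saving checkpoints but is not shown in this
# snippet. A minimal sketch, assuming it only peels off the wrappers (EMA, DDP/DataParallel,
# DetBenchTrain/DetBenchEval) to reach the underlying EfficientDet module:
def _unwrap_bench(model):
    # Hypothetical helper: recurse through common wrapper attributes until a bare module remains.
    if model is None:
        return None
    if hasattr(model, 'ema'):      # ModelEma wrapper
        return _unwrap_bench(model.ema)
    if hasattr(model, 'module'):   # DistributedDataParallel / DataParallel
        return _unwrap_bench(model.module)
    if hasattr(model, 'model'):    # DetBenchTrain / DetBenchEval
        return _unwrap_bench(model.model)
    return model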
def do_main():
    device = torch.device(f'cuda:{gpu_number}') if torch.cuda.is_available() else torch.device('cpu')
    print(device)

    print(len(train_boxes_df))
    print(len(train_images_df))

    # Keep only images that have at least one box
    print('Leave only train images with boxes (all)')
    with_boxes_filter = train_images_df[image_id_column].isin(train_boxes_df[image_id_column].unique())

    images_val = train_images_df.loc[(train_images_df[fold_column] == fold) & with_boxes_filter,
                                     image_id_column].values
    images_train = train_images_df.loc[(train_images_df[fold_column] != fold) & with_boxes_filter,
                                       image_id_column].values
    print(len(images_train), len(images_val))

    train_dataset = WheatDataset(images_train, DIR_TRAIN, train_box_callback,
                                 transforms=get_train_transform(), is_test=False)
    valid_dataset = WheatDataset(images_val, DIR_TRAIN, train_box_callback,
                                 transforms=get_valid_transform(), is_test=True)

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=train_batch_size,
                                   shuffle=True,
                                   num_workers=num_workers,
                                   collate_fn=collate_fn)
    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=inf_batch_size,
                                   shuffle=False,
                                   num_workers=num_workers,
                                   collate_fn=collate_fn)

    # config = get_efficientdet_config('tf_efficientdet_d4')
    config = get_efficientdet_config('tf_efficientdet_d5')
    net = EfficientDet(config, pretrained_backbone=False)
    # load_weights(net, '../timm-efficientdet-pytorch/efficientdet_d4-5b370b7a.pth')
    load_weights(net, '../timm-efficientdet-pytorch/efficientdet_d5-ef44aea8.pth')

    config.num_classes = 1
    config.image_size = our_image_size
    net.class_net = HeadNet(config, num_outputs=config.num_classes,
                            norm_kwargs=dict(eps=.001, momentum=.01))

    fold_weights_file = f'{experiment_name}.pth'
    if os.path.exists(fold_weights_file):
        # continue training
        print('Continue training, loading weights: ' + fold_weights_file)
        load_weights(net, fold_weights_file)

    model_train = DetBenchTrain(net, config)
    model_eval = DetBenchEval(net, config)

    manager = ModelManager(model_train, model_eval, device)
    weights_file = f'{experiment_name}.pth'

    manager.run_train(train_data_loader, valid_data_loader,
                      n_epoches=n_epochs, weights_file=weights_file,
                      factor=factor, start_lr=start_lr, min_lr=min_lr,
                      lr_patience=lr_patience, overall_patience=overall_patience,
                      loss_delta=loss_delta)

    # add tags
    neptune.log_text('save checkpoints as', weights_file[:-4])
    neptune.stop()
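# load_weights() is called both for the COCO-pretrained checkpoint and for resuming from
# '{experiment_name}.pth', but its body is not part of this snippet. A plausible sketch,
# assuming the file holds either a bare state dict or one nested under 'model_state_dict':
import torch

def load_weights(model, weights_file):
    # Hypothetical loader: accept a raw state_dict or a training checkpoint dict.
    state = torch.load(weights_file, map_location='cpu')
    if isinstance(state, dict) and 'model_state_dict' in state:
        state = state['model_state_dict']
    model.load_state_dict(state)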
def validate(args):
    # might as well try to validate something
    args.pretrained = args.pretrained or not args.checkpoint
    args.prefetcher = not args.no_prefetcher

    # create model
    config = get_efficientdet_config(args.model)
    model = EfficientDet(config)
    if args.checkpoint:
        load_checkpoint(model, args.checkpoint)

    param_count = sum([m.numel() for m in model.parameters()])
    logging.info('Model %s created, param count: %d' % (args.model, param_count))

    bench = DetBenchEval(model, config)
    bench.model = bench.model.cuda()
    if has_amp:
        bench.model = amp.initialize(bench.model, opt_level='O1')
    if args.num_gpu > 1:
        bench.model = torch.nn.DataParallel(bench.model, device_ids=list(range(args.num_gpu)))

    if 'test' in args.anno:
        annotation_path = os.path.join(args.data, 'annotations', f'image_info_{args.anno}.json')
        image_dir = 'test2017'
    else:
        annotation_path = os.path.join(args.data, 'annotations', f'instances_{args.anno}.json')
        image_dir = args.anno
    dataset = CocoDetection(os.path.join(args.data, image_dir), annotation_path)

    loader = create_loader(dataset,
                           input_size=config.image_size,
                           batch_size=args.batch_size,
                           use_prefetcher=args.prefetcher,
                           interpolation=args.interpolation,
                           num_workers=args.workers)

    img_ids = []
    results = []
    model.eval()
    batch_time = AverageMeter()
    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(loader):
            output = bench(input, target['img_id'], target['scale'])
            for batch_out in output:
                for det in batch_out:
                    image_id = int(det[0])
                    score = float(det[5])
                    coco_det = {
                        'image_id': image_id,
                        'bbox': det[1:5].tolist(),
                        'score': score,
                        'category_id': int(det[6]),
                    }
                    img_ids.append(image_id)
                    results.append(coco_det)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            if i % args.log_freq == 0:
                print(
                    'Test: [{0:>4d}/{1}] '
                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
                    .format(i, len(loader), batch_time=batch_time,
                            rate_avg=input.size(0) / batch_time.avg))

    json.dump(results, open(args.results, 'w'), indent=4)
    if 'test' not in args.anno:
        coco_results = dataset.coco.loadRes(args.results)
        coco_eval = COCOeval(dataset.coco, coco_results, 'bbox')
        coco_eval.params.imgIds = img_ids  # score only ids we've used
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()

    return results
def do_main():
    neptune.init('ods/wheat')
    # Create experiment with defined parameters
    neptune.create_experiment(name=model_name,
                              params=PARAMS,
                              tags=[experiment_name, experiment_tag],
                              upload_source_files=[os.path.basename(__file__)])
    neptune.append_tags(f'fold_{fold}')
    neptune.append_tags(['grad_accum'])

    device = torch.device(f'cuda:{gpu_number}') if torch.cuda.is_available() else torch.device('cpu')
    print(device)

    print(len(train_boxes_df))
    print(len(train_images_df))

    # Keep only images that have at least one box
    print('Leave only train images with boxes (validation)')
    with_boxes_filter = train_images_df[image_id_column].isin(train_boxes_df[image_id_column].unique())

    # configure models for train and validation
    config = get_efficientdet_config('tf_efficientdet_d5')
    net = EfficientDet(config, pretrained_backbone=False)
    load_weights(net, '../timm-efficientdet-pytorch/efficientdet_d5-ef44aea8.pth')

    config.num_classes = 1
    config.image_size = our_image_size
    net.class_net = HeadNet(config, num_outputs=config.num_classes,
                            norm_kwargs=dict(eps=.001, momentum=.01))
    model_train = DetBenchTrain(net, config)
    model_eval = DetBenchEval(net, config)
    manager = ModelManager(model_train, model_eval, device)

    images_val = train_images_df.loc[(train_images_df[fold_column] == fold) & with_boxes_filter,
                                     image_id_column].values
    images_train = train_images_df.loc[(train_images_df[fold_column] != fold) & with_boxes_filter,
                                       image_id_column].values
    print(f'\nTrain images: {len(images_train)}, validation images: {len(images_val)}')

    # get datasets (note: each split is sliced to its first 160 image ids here)
    train_dataset = WheatDataset(image_ids=images_train[:160],
                                 image_dir=DIR_TRAIN,
                                 boxes_df=train_boxes_df,
                                 transforms=get_train_transform(our_image_size),
                                 is_test=False)
    valid_dataset = WheatDataset(image_ids=images_val[:160],
                                 image_dir=DIR_TRAIN,
                                 boxes_df=train_boxes_df,
                                 transforms=get_valid_transform(our_image_size),
                                 is_test=True)

    train_data_loader = DataLoader(train_dataset,
                                   batch_size=train_batch_size,
                                   shuffle=True,
                                   num_workers=num_workers,
                                   collate_fn=collate_fn,
                                   drop_last=True)
    valid_data_loader = DataLoader(valid_dataset,
                                   batch_size=inf_batch_size,
                                   shuffle=False,
                                   num_workers=num_workers,
                                   collate_fn=collate_fn)

    weights_file = f'../checkpoints/{model_name}/{experiment_name}.pth'
    # pretrain_weights_file = f'{checkpoints_dir}/{experiment_name}.pth'
    # if os.path.exists(pretrain_weights_file):
    #     print(f'Continue training, loading weights from {pretrain_weights_file}')
    #     load_weights(net, pretrain_weights_file)

    manager.run_train(train_generator=train_data_loader,
                      val_generator=valid_data_loader,
                      n_epoches=n_epochs,
                      weights_file=weights_file,
                      factor=factor,
                      start_lr=start_lr,
                      min_lr=min_lr,
                      lr_patience=lr_patience,
                      overall_patience=overall_patience,
                      loss_delta=loss_delta)

    # add tags
    neptune.log_text('save checkpoints as', weights_file[:-4])
    neptune.stop()
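# get_train_transform() / get_valid_transform() are imported from elsewhere. A minimal
# albumentations-based sketch of what they are assumed to return (flips plus resize and
# tensor conversion, with pascal_voc-style bbox handling); these are assumptions, not the
# repository's exact augmentation pipeline:
import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2

def get_train_transform(image_size):
    # Assumed training augmentations; bbox_params keeps box coordinates in sync with the image.
    return A.Compose(
        [
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.Resize(height=image_size, width=image_size, p=1.0),
            ToTensorV2(p=1.0),
        ],
        bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']),
    )

def get_valid_transform(image_size):
    # Validation: deterministic resize and tensor conversion only.
    return A.Compose(
        [A.Resize(height=image_size, width=image_size, p=1.0), ToTensorV2(p=1.0)],
        bbox_params=A.BboxParams(format='pascal_voc', label_fields=['labels']),
    )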
train_data_loader = DataLoader(train_dataset,
                               batch_size=train_batch_size,
                               shuffle=True,
                               num_workers=num_workers,
                               collate_fn=collate_fn)
valid_data_loader = DataLoader(valid_dataset,
                               batch_size=inf_batch_size,
                               shuffle=False,
                               num_workers=num_workers,
                               collate_fn=collate_fn)

# weights_file = 'effdet_model14_fold' + str(fold_) + '.pth'
weights_file = '../Weights/effdet_fold_1_model16_alex_fold1.pth'
# weights_file = 'effdet_alex_fold0.pth'

config = get_efficientdet_config('tf_efficientdet_d5')
net = EfficientDet(config, pretrained_backbone=False)
config.num_classes = 1
config.image_size = our_image_size
net.class_net = HeadNet(config, num_outputs=config.num_classes,
                        norm_kwargs=dict(eps=.001, momentum=.01))
load_weights(net, weights_file)

model = DetBenchEval(net, config)
manager = ModelManager(model, device)

true_list, pred_boxes, pred_scores = manager.predict(valid_data_loader)

# prediction score thresholds to sweep when deciding whether a detection counts as positive
prob_thresholds = np.linspace(0.35, 0.45, num=10, endpoint=False)
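# The threshold grid above is presumably swept against the validation predictions. A sketch of
# such a sweep, assuming pred_boxes[i] / pred_scores[i] are numpy arrays and using an assumed
# per-image metric helper calculate_image_precision(gt_boxes, boxes) (hypothetical name, not
# defined in this snippet):
best_threshold, best_precision = None, -1.0
for threshold in prob_thresholds:
    precisions = []
    for gt_boxes, boxes, scores in zip(true_list, pred_boxes, pred_scores):
        keep = scores >= threshold               # drop low-confidence detections
        precisions.append(calculate_image_precision(gt_boxes, boxes[keep]))
    mean_precision = float(np.mean(precisions))
    if mean_precision > best_precision:
        best_threshold, best_precision = threshold, mean_precision
print(f'Best threshold: {best_threshold:.3f}, precision: {best_precision:.4f}')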