def create_dataloader(path, imgsz, batch_size, stride, single_cls=False, hyp=None, augment=False, cache=False, pad=0.0, rect=False, rank=-1, workers=8, image_weights=False, quad=False, prefix='', shuffle=False): if rect and shuffle: LOGGER.warning('WARNING: --rect is incompatible with DataLoader shuffle, setting shuffle=False') shuffle = False with torch_distributed_zero_first(rank): # init dataset *.cache only once if DDP dataset = LoadImagesAndLabels(path, imgsz, batch_size, augment=augment, # augmentation hyp=hyp, # hyperparameters rect=rect, # rectangular batches cache_images=cache, single_cls=single_cls, stride=int(stride), pad=pad, image_weights=image_weights, prefix=prefix) batch_size = min(batch_size, len(dataset)) nw = min([os.cpu_count() // DEVICE_COUNT, batch_size if batch_size > 1 else 0, workers]) # number of workers sampler = None if rank == -1 else distributed.DistributedSampler(dataset, shuffle=shuffle) loader = DataLoader if image_weights else InfiniteDataLoader # only DataLoader allows for attribute updates return loader(dataset, batch_size=batch_size, shuffle=shuffle and sampler is None, num_workers=nw, sampler=sampler, pin_memory=True, collate_fn=LoadImagesAndLabels.collate_fn4 if quad else LoadImagesAndLabels.collate_fn), dataset
def cache_labels(self, path=Path('./labels.cache'), prefix=''): # Cache dataset labels, check images and read shapes x = {} # dict nm, nf, ne, nc, msgs = 0, 0, 0, 0, [] # number missing, found, empty, corrupt, messages desc = f"{prefix}Scanning '{path.parent / path.stem}' images and labels..." with Pool(NUM_THREADS) as pool: pbar = tqdm(pool.imap(verify_image_label, zip(self.img_files, self.label_files, repeat(prefix))), desc=desc, total=len(self.img_files)) for im_file, l, shape, segments, nm_f, nf_f, ne_f, nc_f, msg in pbar: nm += nm_f nf += nf_f ne += ne_f nc += nc_f if im_file: x[im_file] = [l, shape, segments] if msg: msgs.append(msg) pbar.desc = f"{desc}{nf} found, {nm} missing, {ne} empty, {nc} corrupt" pbar.close() if msgs: LOGGER.info('\n'.join(msgs)) if nf == 0: LOGGER.warning(f'{prefix}WARNING: No labels found in {path}. See {HELP_URL}') x['hash'] = get_hash(self.label_files + self.img_files) x['results'] = nf, nm, ne, nc, len(self.img_files) x['msgs'] = msgs # warnings x['version'] = self.cache_version # cache version try: np.save(path, x) # save cache for next time path.with_suffix('.cache.npy').rename(path) # remove .npy suffix LOGGER.info(f'{prefix}New cache created: {path}') except Exception as e: LOGGER.warning(f'{prefix}WARNING: Cache directory {path.parent} is not writeable: {e}') # not writeable return x
def autobatch(model, imgsz=640, fraction=0.9, batch_size=16): # Automatically estimate best batch size to use `fraction` of available CUDA memory # Usage: # import torch # from utils.autobatch import autobatch # model = torch.hub.load('ultralytics/yolov5', 'yolov5s', autoshape=False) # print(autobatch(model)) # Check device prefix = colorstr('AutoBatch: ') LOGGER.info(f'{prefix}Computing optimal batch size for --imgsz {imgsz}') device = next(model.parameters()).device # get model device if device.type == 'cpu': LOGGER.info( f'{prefix}CUDA not detected, using default CPU batch-size {batch_size}' ) return batch_size # Inspect CUDA memory gb = 1 << 30 # bytes to GiB (1024 ** 3) d = str(device).upper() # 'CUDA:0' properties = torch.cuda.get_device_properties(device) # device properties t = properties.total_memory / gb # GiB total r = torch.cuda.memory_reserved(device) / gb # GiB reserved a = torch.cuda.memory_allocated(device) / gb # GiB allocated f = t - (r + a) # GiB free LOGGER.info( f'{prefix}{d} ({properties.name}) {t:.2f}G total, {r:.2f}G reserved, {a:.2f}G allocated, {f:.2f}G free' ) # Profile batch sizes batch_sizes = [1, 2, 4, 8, 16] try: img = [torch.zeros(b, 3, imgsz, imgsz) for b in batch_sizes] results = profile(img, model, n=3, device=device) except Exception as e: LOGGER.warning(f'{prefix}{e}') # Fit a solution y = [x[2] for x in results if x] # memory [2] p = np.polyfit(batch_sizes[:len(y)], y, deg=1) # first degree polynomial fit b = int((f * fraction - p[1]) / p[0]) # y intercept (optimal batch size) if None in results: # some sizes failed i = results.index(None) # first fail index if b >= batch_sizes[i]: # y intercept above failure point b = batch_sizes[max(i - 1, 0)] # select prior safe point fraction = np.polyval(p, b) / t # actual fraction predicted LOGGER.info( emojis( f'{prefix}Using batch-size {b} for {d} {t * fraction:.2f}G/{t:.2f}G ({fraction * 100:.0f}%) ✅' )) return b
def run( weights=ROOT / 'yolov5s.pt', # weights path imgsz=640, # inference size (pixels) batch_size=1, # batch size data=ROOT / 'data/coco128.yaml', # dataset.yaml path device='', # cuda device, i.e. 0 or 0,1,2,3 or cpu half=False, # use FP16 half-precision inference ): y, t = [], time.time() formats = export.export_formats() device = select_device(device) for i, (name, f, suffix, gpu) in formats.iterrows( ): # index, (name, file, suffix, gpu-capable) try: if device.type != 'cpu': assert gpu, f'{name} inference not supported on GPU' if f == '-': w = weights # PyTorch format else: w = export.run(weights=weights, imgsz=[imgsz], include=[f], device=device, half=half)[-1] # all others assert suffix in str(w), 'export failed' result = val.run(data, w, batch_size, imgsz, plots=False, device=device, task='benchmark', half=half) metrics = result[ 0] # metrics (mp, mr, map50, map, *losses(box, obj, cls)) speeds = result[2] # times (preprocess, inference, postprocess) y.append([name, metrics[3], speeds[1]]) # mAP, t_inference except Exception as e: LOGGER.warning(f'WARNING: Benchmark failure for {name}: {e}') y.append([name, None, None]) # mAP, t_inference # Print results LOGGER.info('\n') parse_opt() notebook_init() # print system info py = pd.DataFrame( y, columns=['Format', '[email protected]:0.95', 'Inference time (ms)']) LOGGER.info(f'\nBenchmarks complete ({time.time() - t:.2f}s)') LOGGER.info(str(py)) return py
def update(self, i, cap, stream): # Read stream `i` frames in daemon thread n, f, read = 0, self.frames[i], 1 # frame number, frame array, inference every 'read' frame while cap.isOpened() and n < f: n += 1 # _, self.imgs[index] = cap.read() cap.grab() if n % read == 0: success, im = cap.retrieve() if success: self.imgs[i] = im else: LOGGER.warning('WARNING: Video stream unresponsive, please check your IP camera connection.') self.imgs[i] = np.zeros_like(self.imgs[i]) cap.open(stream) # re-open stream if signal was lost time.sleep(0.0) # wait time
def __init__(self, sources='streams.txt', img_size=640, stride=32, auto=True): self.mode = 'stream' self.img_size = img_size self.stride = stride if os.path.isfile(sources): with open(sources) as f: sources = [x.strip() for x in f.read().strip().splitlines() if len(x.strip())] else: sources = [sources] n = len(sources) self.imgs, self.fps, self.frames, self.threads = [None] * n, [0] * n, [0] * n, [None] * n self.sources = [clean_str(x) for x in sources] # clean source names for later self.auto = auto for i, s in enumerate(sources): # index, source # Start thread to read frames from video stream st = f'{i + 1}/{n}: {s}... ' if urlparse(s).hostname in ('www.youtube.com', 'youtube.com', 'youtu.be'): # if source is YouTube video check_requirements(('pafy', 'youtube_dl==2020.12.2')) import pafy s = pafy.new(s).getbest(preftype="mp4").url # YouTube URL s = eval(s) if s.isnumeric() else s # i.e. s = '0' local webcam if s == 0: assert not is_colab(), '--source 0 webcam unsupported on Colab. Rerun command in a local environment.' assert not is_kaggle(), '--source 0 webcam unsupported on Kaggle. Rerun command in a local environment.' cap = cv2.VideoCapture(s) assert cap.isOpened(), f'{st}Failed to open {s}' w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) fps = cap.get(cv2.CAP_PROP_FPS) # warning: may return 0 or nan self.frames[i] = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float('inf') # infinite stream fallback self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30 # 30 FPS fallback _, self.imgs[i] = cap.read() # guarantee first frame self.threads[i] = Thread(target=self.update, args=([i, cap, s]), daemon=True) LOGGER.info(f"{st} Success ({self.frames[i]} frames {w}x{h} at {self.fps[i]:.2f} FPS)") self.threads[i].start() LOGGER.info('') # newline # check for common shapes s = np.stack([letterbox(x, self.img_size, stride=self.stride, auto=self.auto)[0].shape for x in self.imgs]) self.rect = np.unique(s, axis=0).shape[0] == 1 # rect inference if all shapes equal if not self.rect: LOGGER.warning('WARNING: Stream shapes differ. For optimal performance supply similarly-shaped streams.')
def autobatch(model, imgsz=640, fraction=0.9, batch_size=16): # Automatically estimate best batch size to use `fraction` of available CUDA memory # Usage: # import torch # from utils.autobatch import autobatch # model = torch.hub.load('ultralytics/yolov5', 'yolov5s', autoshape=False) # print(autobatch(model)) prefix = colorstr('AutoBatch: ') LOGGER.info(f'{prefix}Computing optimal batch size for --imgsz {imgsz}') device = next(model.parameters()).device # get model device if device.type == 'cpu': LOGGER.info( f'{prefix}CUDA not detected, using default CPU batch-size {batch_size}' ) return batch_size gb = 1 << 30 # bytes to GiB (1024 ** 3) d = str(device).upper() # 'CUDA:0' properties = torch.cuda.get_device_properties(device) # device properties t = properties.total_memory / gb # (GiB) r = torch.cuda.memory_reserved(device) / gb # (GiB) a = torch.cuda.memory_allocated(device) / gb # (GiB) f = t - (r + a) # free inside reserved LOGGER.info( f'{prefix}{d} ({properties.name}) {t:.2f}G total, {r:.2f}G reserved, {a:.2f}G allocated, {f:.2f}G free' ) batch_sizes = [1, 2, 4, 8, 16] try: img = [torch.zeros(b, 3, imgsz, imgsz) for b in batch_sizes] y = profile(img, model, n=3, device=device) except Exception as e: LOGGER.warning(f'{prefix}{e}') y = [x[2] for x in y if x] # memory [2] batch_sizes = batch_sizes[:len(y)] p = np.polyfit(batch_sizes, y, deg=1) # first degree polynomial fit b = int((f * fraction - p[1]) / p[0]) # y intercept (optimal batch size) LOGGER.info( f'{prefix}Using batch-size {b} for {d} {t * fraction:.2f}G/{t:.2f}G ({fraction * 100:.0f}%)' ) return b
def run( weights=ROOT / 'yolov5s.pt', # weights path imgsz=640, # inference size (pixels) batch_size=1, # batch size data=ROOT / 'data/coco128.yaml', # dataset.yaml path ): y, t = [], time.time() formats = export.export_formats() for i, (name, f, suffix) in formats.iterrows(): # index, (name, file, suffix) try: w = weights if f == '-' else export.run( weights=weights, imgsz=[imgsz], include=[f], device='cpu')[-1] assert suffix in str(w), 'export failed' result = val.run(data, w, batch_size, imgsz=imgsz, plots=False, device='cpu', task='benchmark') metrics = result[ 0] # metrics (mp, mr, map50, map, *losses(box, obj, cls)) speeds = result[2] # times (preprocess, inference, postprocess) y.append([name, metrics[3], speeds[1]]) # mAP, t_inference except Exception as e: LOGGER.warning(f'WARNING: Benchmark failure for {name}: {e}') y.append([name, None, None]) # mAP, t_inference # Print results LOGGER.info('\n') parse_opt() notebook_init() # print system info py = pd.DataFrame( y, columns=['Format', '[email protected]:0.95', 'Inference time (ms)']) LOGGER.info(f'\nBenchmarks complete ({time.time() - t:.2f}s)') LOGGER.info(str(py)) return py
def train( hyp, # path/to/hyp.yaml or hyp dictionary opt, device, callbacks): save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \ opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze # Directories w = save_dir / 'weights' # weights dir (w.parent if evolve else w).mkdir(parents=True, exist_ok=True) # make dir last, best = w / 'last.pt', w / 'best.pt' # Hyperparameters if isinstance(hyp, str): with open(hyp, errors='ignore') as f: hyp = yaml.safe_load(f) # load hyps dict LOGGER.info( colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) # Save run settings if not evolve: with open(save_dir / 'hyp.yaml', 'w') as f: yaml.safe_dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.safe_dump(vars(opt), f, sort_keys=False) # Loggers data_dict = None if RANK in [-1, 0]: loggers = Loggers(save_dir, weights, opt, hyp, LOGGER) # loggers instance if loggers.wandb: data_dict = loggers.wandb.data_dict if resume: weights, epochs, hyp, batch_size = opt.weights, opt.epochs, opt.hyp, opt.batch_size # Register actions for k in methods(loggers): callbacks.register_action(k, callback=getattr(loggers, k)) # Config plots = not evolve # create plots cuda = device.type != 'cpu' init_seeds(1 + RANK) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] nc = 1 if single_cls else int(data_dict['nc']) # number of classes names = ['item'] if single_cls and len( data_dict['names']) != 1 else data_dict['names'] # class names assert len( names ) == nc, f'{len(names)} names found for nc={nc} dataset in {data}' # check is_coco = isinstance(val_path, str) and val_path.endswith( 'coco/val2017.txt') # COCO dataset # Model check_suffix(weights, '.pt') # check weights pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(LOCAL_RANK): weights = attempt_download( weights) # download if not found locally ckpt = torch.load(weights, map_location='cpu' ) # load checkpoint to CPU to avoid CUDA memory leak model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create exclude = [ 'anchor' ] if (cfg or hyp.get('anchors')) and not resume else [] # exclude keys csd = ckpt['model'].float().state_dict( ) # checkpoint state_dict as FP32 csd = intersect_dicts(csd, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(csd, strict=False) # load LOGGER.info( f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}' ) # report else: model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create # Freeze freeze = [ f'model.{x}.' for x in (freeze if len(freeze) > 1 else range(freeze[0])) ] # layers to freeze for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): LOGGER.info(f'freezing {k}') v.requires_grad = False # Image size gs = max(int(model.stride.max()), 32) # grid size (max stride) imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2) # verify imgsz is gs-multiple # Batch size if RANK == -1 and batch_size == -1: # single-GPU only, estimate best batch size batch_size = check_train_batch_size(model, imgsz) loggers.on_params_update({"batch_size": batch_size}) # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= batch_size * accumulate / nbs # scale weight_decay LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}") g0, g1, g2 = [], [], [] # optimizer parameter groups for v in model.modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): # bias g2.append(v.bias) if isinstance(v, nn.BatchNorm2d): # weight (no decay) g0.append(v.weight) elif hasattr(v, 'weight') and isinstance( v.weight, nn.Parameter): # weight (with decay) g1.append(v.weight) if opt.optimizer == 'Adam': optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum elif opt.optimizer == 'AdamW': optimizer = AdamW(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': g1, 'weight_decay': hyp['weight_decay'] }) # add g1 with weight_decay optimizer.add_param_group({'params': g2}) # add g2 (biases) LOGGER.info( f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups " f"{len(g0)} weight (no decay), {len(g1)} weight, {len(g2)} bias") del g0, g1, g2 # Scheduler if opt.cos_lr: lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] else: lf = lambda x: (1 - x / epochs) * (1.0 - hyp['lrf']) + hyp['lrf' ] # linear scheduler = lr_scheduler.LambdaLR( optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # EMA ema = ModelEMA(model) if RANK in [-1, 0] else None # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # EMA if ema and ckpt.get('ema'): ema.ema.load_state_dict(ckpt['ema'].float().state_dict()) ema.updates = ckpt['updates'] # Epochs start_epoch = ckpt['epoch'] + 1 if resume: assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.' if epochs < start_epoch: LOGGER.info( f"{weights} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {epochs} more epochs." ) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, csd # DP mode if cuda and RANK == -1 and torch.cuda.device_count() > 1: LOGGER.warning( 'WARNING: DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n' 'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.' ) model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and RANK != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) LOGGER.info('Using SyncBatchNorm()') # Trainloader train_loader, dataset = create_dataloader( train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls, hyp=hyp, augment=True, cache=None if opt.cache == 'val' else opt.cache, rect=opt.rect, rank=LOCAL_RANK, workers=workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '), shuffle=True) mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max()) # max label class nb = len(train_loader) # number of batches assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}' # Process 0 if RANK in [-1, 0]: val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls, hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1, workers=workers * 2, pad=0.5, prefix=colorstr('val: '))[0] if not resume: labels = np.concatenate(dataset.labels, 0) # c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, names, save_dir) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) model.half().float() # pre-reduce anchor precision callbacks.run('on_pretrain_routine_end') # DDP mode if cuda and RANK != -1: model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK) # Model attributes nl = de_parallel( model).model[-1].nl # number of detection layers (to scale hyps) hyp['box'] *= 3 / nl # scale to layers hyp['cls'] *= nc / 80 * 3 / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640)**2 * 3 / nl # scale to image size and layers hyp['label_smoothing'] = opt.label_smoothing model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.class_weights = labels_to_class_weights( dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training last_opt_step = -1 maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) stopper = EarlyStopping(patience=opt.patience) compute_loss = ComputeLoss(model) # init loss class LOGGER.info( f'Image sizes {imgsz} train, {imgsz} val\n' f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n' f"Logging results to {colorstr('bold', save_dir)}\n" f'Starting training for {epochs} epochs...') for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional, single-GPU only) if opt.image_weights: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Update mosaic border (optional) # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(3, device=device) # mean losses if RANK != -1: train_loader.sampler.set_epoch(epoch) pbar = enumerate(train_loader) LOGGER.info( ('\n' + '%10s' * 7) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'labels', 'img_size')) if RANK in [-1, 0]: pbar = tqdm( pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device)) # loss scaled by batch_size if RANK != -1: loss *= WORLD_SIZE # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. # Backward scaler.scale(loss).backward() # Optimize if ni - last_opt_step >= accumulate: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) last_opt_step = ni # Log if RANK in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB) pbar.set_description(('%10s' * 2 + '%10.4g' * 5) % (f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])) callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots, opt.sync_bn) if callbacks.stop_training: return # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for loggers scheduler.step() if RANK in [-1, 0]: # mAP callbacks.run('on_train_epoch_end', epoch=epoch) ema.update_attr(model, include=[ 'yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights' ]) final_epoch = (epoch + 1 == epochs) or stopper.possible_stop if not noval or final_epoch: # Calculate mAP results, maps, _ = val.run(data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, model=ema.ema, single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, plots=False, callbacks=callbacks, compute_loss=compute_loss) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi log_vals = list(mloss) + list(results) + lr callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi) # Save model if (not nosave) or (final_epoch and not evolve): # if save ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'model': deepcopy(de_parallel(model)).half(), 'ema': deepcopy(ema.ema).half(), 'updates': ema.updates, 'optimizer': optimizer.state_dict(), 'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None, 'date': datetime.now().isoformat() } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) if (epoch > 0) and (opt.save_period > 0) and (epoch % opt.save_period == 0): torch.save(ckpt, w / f'epoch{epoch}.pt') del ckpt callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi) # Stop Single-GPU if RANK == -1 and stopper(epoch=epoch, fitness=fi): break # Stop DDP TODO: known issues shttps://github.com/ultralytics/yolov5/pull/4576 # stop = stopper(epoch=epoch, fitness=fi) # if RANK == 0: # dist.broadcast_object_list([stop], 0) # broadcast 'stop' to all ranks # Stop DPP # with torch_distributed_zero_first(RANK): # if stop: # break # must break all DDP ranks # end epoch ---------------------------------------------------------------------------------------------------- # end training ----------------------------------------------------------------------------------------------------- if RANK in [-1, 0]: LOGGER.info( f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.' ) for f in last, best: if f.exists(): strip_optimizer(f) # strip optimizers if f is best: LOGGER.info(f'\nValidating {f}...') results, _, _ = val.run( data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, model=attempt_load(f, device).half(), iou_thres=0.65 if is_coco else 0.60, # best pycocotools results at 0.65 single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, save_json=is_coco, verbose=True, plots=True, callbacks=callbacks, compute_loss=compute_loss) # val best model with plots if is_coco: callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi) callbacks.run('on_train_end', last, best, plots, epoch, results) LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}") torch.cuda.empty_cache() return results
def kmean_anchors(dataset='./data/coco128.yaml', n=9, img_size=640, thr=4.0, gen=1000, verbose=True): """ Creates kmeans-evolved anchors from training dataset Arguments: dataset: path to data.yaml, or a loaded dataset n: number of anchors img_size: image size used for training thr: anchor-label wh ratio threshold hyperparameter hyp['anchor_t'] used for training, default=4.0 gen: generations to evolve anchors using genetic algorithm verbose: print all results Return: k: kmeans evolved anchors Usage: from utils.autoanchor import *; _ = kmean_anchors() """ from scipy.cluster.vq import kmeans npr = np.random thr = 1 / thr def metric(k, wh): # compute metrics r = wh[:, None] / k[None] x = torch.min(r, 1 / r).min(2)[0] # ratio metric # x = wh_iou(wh, torch.tensor(k)) # iou metric return x, x.max(1)[0] # x, best_x def anchor_fitness(k): # mutation fitness _, best = metric(torch.tensor(k, dtype=torch.float32), wh) return (best * (best > thr).float()).mean() # fitness def print_results(k, verbose=True): k = k[np.argsort(k.prod(1))] # sort small to large x, best = metric(k, wh0) bpr, aat = (best > thr).float().mean(), ( x > thr).float().mean() * n # best possible recall, anch > thr s = f'{PREFIX}thr={thr:.2f}: {bpr:.4f} best possible recall, {aat:.2f} anchors past thr\n' \ f'{PREFIX}n={n}, img_size={img_size}, metric_all={x.mean():.3f}/{best.mean():.3f}-mean/best, ' \ f'past_thr={x[x > thr].mean():.3f}-mean: ' for i, x in enumerate(k): s += '%i,%i, ' % (round(x[0]), round(x[1])) if verbose: LOGGER.info(s[:-2]) return k if isinstance(dataset, str): # *.yaml file with open(dataset, errors='ignore') as f: data_dict = yaml.safe_load(f) # model dict from utils.datasets import LoadImagesAndLabels dataset = LoadImagesAndLabels(data_dict['train'], augment=True, rect=True) # Get label wh shapes = img_size * dataset.shapes / dataset.shapes.max(1, keepdims=True) wh0 = np.concatenate( [l[:, 3:5] * s for s, l in zip(shapes, dataset.labels)]) # wh # Filter i = (wh0 < 3.0).any(1).sum() if i: LOGGER.info( f'{PREFIX}WARNING: Extremely small objects found: {i} of {len(wh0)} labels are < 3 pixels in size' ) wh = wh0[(wh0 >= 2.0).any(1)] # filter > 2 pixels # wh = wh * (npr.rand(wh.shape[0], 1) * 0.9 + 0.1) # multiply by random scale 0-1 # Kmeans init try: LOGGER.info( f'{PREFIX}Running kmeans for {n} anchors on {len(wh)} points...') assert n <= len(wh) # apply overdetermined constraint s = wh.std(0) # sigmas for whitening k = kmeans(wh / s, n, iter=30)[0] * s # points assert n == len( k ) # kmeans may return fewer points than requested if wh is insufficient or too similar except Exception: LOGGER.warning( f'{PREFIX}WARNING: switching strategies from kmeans to random init' ) k = np.sort(npr.rand(n * 2)).reshape(n, 2) * img_size # random init wh, wh0 = (torch.tensor(x, dtype=torch.float32) for x in (wh, wh0)) k = print_results(k, verbose=False) # Plot # k, d = [None] * 20, [None] * 20 # for i in tqdm(range(1, 21)): # k[i-1], d[i-1] = kmeans(wh / s, i) # points, mean distance # fig, ax = plt.subplots(1, 2, figsize=(14, 7), tight_layout=True) # ax = ax.ravel() # ax[0].plot(np.arange(1, 21), np.array(d) ** 2, marker='.') # fig, ax = plt.subplots(1, 2, figsize=(14, 7)) # plot wh # ax[0].hist(wh[wh[:, 0]<100, 0],400) # ax[1].hist(wh[wh[:, 1]<100, 1],400) # fig.savefig('wh.png', dpi=200) # Evolve f, sh, mp, s = anchor_fitness( k), k.shape, 0.9, 0.1 # fitness, generations, mutation prob, sigma pbar = tqdm(range(gen), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar for _ in pbar: v = np.ones(sh) while (v == 1 ).all(): # mutate until a change occurs (prevent duplicates) v = ((npr.random(sh) < mp) * random.random() * npr.randn(*sh) * s + 1).clip(0.3, 3.0) kg = (k.copy() * v).clip(min=2.0) fg = anchor_fitness(kg) if fg > f: f, k = fg, kg.copy() pbar.desc = f'{PREFIX}Evolving anchors with Genetic Algorithm: fitness = {f:.4f}' if verbose: print_results(k, verbose) return print_results(k)
def run( weights=ROOT / 'yolov5s.pt', # weights path imgsz=640, # inference size (pixels) batch_size=1, # batch size data=ROOT / 'data/coco128.yaml', # dataset.yaml path device='', # cuda device, i.e. 0 or 0,1,2,3 or cpu half=False, # use FP16 half-precision inference test=False, # test exports only pt_only=False, # test PyTorch only ): y, t = [], time.time() device = select_device(device) for i, (name, f, suffix, gpu) in export.export_formats().iterrows( ): # index, (name, file, suffix, gpu-capable) try: assert i != 9, 'Edge TPU not supported' assert i != 10, 'TF.js not supported' if device.type != 'cpu': assert gpu, f'{name} inference not supported on GPU' # Export if f == '-': w = weights # PyTorch format else: w = export.run(weights=weights, imgsz=[imgsz], include=[f], device=device, half=half)[-1] # all others assert suffix in str(w), 'export failed' # Validate result = val.run(data, w, batch_size, imgsz, plots=False, device=device, task='benchmark', half=half) metrics = result[ 0] # metrics (mp, mr, map50, map, *losses(box, obj, cls)) speeds = result[2] # times (preprocess, inference, postprocess) y.append([ name, round(file_size(w), 1), round(metrics[3], 4), round(speeds[1], 2) ]) # MB, mAP, t_inference except Exception as e: LOGGER.warning(f'WARNING: Benchmark failure for {name}: {e}') y.append([name, None, None, None]) # mAP, t_inference if pt_only and i == 0: break # break after PyTorch # Print results LOGGER.info('\n') parse_opt() notebook_init() # print system info c = ['Format', 'Size (MB)', '[email protected]:0.95', 'Inference time (ms)' ] if map else ['Format', 'Export', '', ''] py = pd.DataFrame(y, columns=c) LOGGER.info(f'\nBenchmarks complete ({time.time() - t:.2f}s)') LOGGER.info(str(py if map else py.iloc[:, :2])) return py