def main(opt, callbacks=Callbacks()): # Checks if RANK in [-1, 0]: print_args(FILE.stem, opt) check_git_status() check_requirements(exclude=['thop']) # Resume if opt.resume and not check_wandb_resume( opt) and not opt.evolve: # resume an interrupted run ckpt = opt.resume if isinstance( opt.resume, str) else get_latest_run() # specified or most recent path assert os.path.isfile( ckpt), 'ERROR: --resume checkpoint does not exist' with open(Path(ckpt).parent.parent / 'opt.yaml', errors='ignore') as f: opt = argparse.Namespace(**yaml.safe_load(f)) # replace opt.cfg, opt.weights, opt.resume = '', ckpt, True # reinstate LOGGER.info(f'Resuming training from {ckpt}') else: opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = \ check_file(opt.data), check_yaml(opt.cfg), check_yaml(opt.hyp), str(opt.weights), str(opt.project) # checks assert len(opt.cfg) or len( opt.weights), 'either --cfg or --weights must be specified' if opt.evolve: if opt.project == str( ROOT / 'runs/train' ): # if default project name, rename to runs/evolve opt.project = str(ROOT / 'runs/evolve') opt.exist_ok, opt.resume = opt.resume, False # pass resume to exist_ok and disable resume opt.save_dir = str( increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok)) # DDP mode device = select_device(opt.device, batch_size=opt.batch_size) if LOCAL_RANK != -1: msg = 'is not compatible with YOLOv5 Multi-GPU DDP training' assert not opt.image_weights, f'--image-weights {msg}' assert not opt.evolve, f'--evolve {msg}' assert opt.batch_size != -1, f'AutoBatch with --batch-size -1 {msg}, please pass a valid --batch-size' assert opt.batch_size % WORLD_SIZE == 0, f'--batch-size {opt.batch_size} must be multiple of WORLD_SIZE' assert torch.cuda.device_count( ) > LOCAL_RANK, 'insufficient CUDA devices for DDP command' torch.cuda.set_device(LOCAL_RANK) device = torch.device('cuda', LOCAL_RANK) dist.init_process_group( backend="nccl" if dist.is_nccl_available() else "gloo") # Train if not opt.evolve: train(opt.hyp, opt, device, callbacks) if WORLD_SIZE > 1 and RANK == 0: LOGGER.info('Destroying process group... ') dist.destroy_process_group() # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = { 'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr 'box': (1, 0.02, 0.2), # box loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight 'iou_t': (0, 0.1, 0.7), # IoU training threshold 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) 'scale': (1, 0.0, 0.9), # image scale (+/- gain) 'shear': (1, 0.0, 10.0), # image shear (+/- deg) 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) 'mosaic': (1, 0.0, 1.0), # image mixup (probability) 'mixup': (1, 0.0, 1.0), # image mixup (probability) 'copy_paste': (1, 0.0, 1.0) } # segment copy-paste (probability) with open(opt.hyp, errors='ignore') as f: hyp = yaml.safe_load(f) # load hyps dict if 'anchors' not in hyp: # anchors commented in hyp.yaml hyp['anchors'] = 3 opt.noval, opt.nosave, save_dir = True, True, Path( opt.save_dir) # only val/save final epoch # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices evolve_yaml, evolve_csv = save_dir / 'hyp_evolve.yaml', save_dir / 'evolve.csv' if opt.bucket: os.system(f'gsutil cp gs://{opt.bucket}/evolve.csv {evolve_csv}' ) # download evolve.csv if exists for _ in range(opt.evolve): # generations to evolve if evolve_csv.exists( ): # if evolve.csv exists: select best hyps and mutate # Select parent(s) parent = 'single' # parent selection method: 'single' or 'weighted' x = np.loadtxt(evolve_csv, ndmin=2, delimiter=',', skiprows=1) n = min(5, len(x)) # number of previous results to consider x = x[np.argsort(-fitness(x))][:n] # top n mutations w = fitness(x) - fitness(x).min() + 1E-6 # weights (sum > 0) if parent == 'single' or len(x) == 1: # x = x[random.randint(0, n - 1)] # random selection x = x[random.choices(range(n), weights=w)[0]] # weighted selection elif parent == 'weighted': x = (x * w.reshape( n, 1)).sum(0) / w.sum() # weighted combination # Mutate mp, s = 0.8, 0.2 # mutation probability, sigma npr = np.random npr.seed(int(time.time())) g = np.array([meta[k][0] for k in hyp.keys()]) # gains 0-1 ng = len(meta) v = np.ones(ng) while all( v == 1 ): # mutate until a change occurs (prevent duplicates) v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) hyp[k] = float(x[i + 7] * v[i]) # mutate # Constrain to limits for k, v in meta.items(): hyp[k] = max(hyp[k], v[1]) # lower limit hyp[k] = min(hyp[k], v[2]) # upper limit hyp[k] = round(hyp[k], 5) # significant digits # Train mutation results = train(hyp.copy(), opt, device, callbacks) callbacks = Callbacks() # Write mutation results print_mutation(results, hyp.copy(), save_dir, opt.bucket) # Plot results plot_evolve(evolve_csv) LOGGER.info( f'Hyperparameter evolution finished {opt.evolve} generations\n' f"Results saved to {colorstr('bold', save_dir)}\n" f'Usage example: $ python train.py --hyp {evolve_yaml}')
type=int, default=-1, help='Log model after every "save_period" epoch') parser.add_argument('--artifact_alias', type=str, default="latest", help='version of dataset artifact to be used') opt = parser.parse_args() # Set DDP variables opt.world_size = int( os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1 set_logging(opt.global_rank) if opt.global_rank in [-1, 0]: check_git_status() check_requirements(exclude=('pycocotools', 'thop')) # Resume wandb_run = check_wandb_resume(opt) if opt.resume and not wandb_run: # resume an interrupted run ckpt = opt.resume if isinstance( opt.resume, str) else get_latest_run() # specified or most recent path assert os.path.isfile( ckpt), 'ERROR: --resume checkpoint does not exist' apriori = opt.global_rank, opt.local_rank with open(Path(ckpt).parent.parent / 'opt.yaml') as f: opt = argparse.Namespace(**yaml.safe_load(f)) # replace opt.cfg, opt.weights, opt.resume, opt.batch_size, opt.global_rank, opt.local_rank = \ '', ckpt, True, opt.total_batch_size, *apriori # reinstate
def trainer(opt): # Set DDP variables opt.total_batch_size = opt.batch_size opt.world_size = int( os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1 set_logging(opt.global_rank) if opt.global_rank in [-1, 0]: check_git_status() # Resume if opt.resume: # resume an interrupted run ckpt = opt.resume if isinstance( opt.resume, str) else get_latest_run() # specified or most recent path log_dir = Path(ckpt).parent.parent # runs/exp0 assert os.path.isfile( ckpt), 'ERROR: --resume checkpoint does not exist' with open(log_dir / 'opt.yaml') as f: opt = argparse.Namespace(**yaml.load( f, Loader=yaml.FullLoader)) # replace opt.cfg, opt.weights, opt.resume = '', ckpt, True logger.info('Resuming training from %s' % ckpt) else: # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file( opt.cfg), check_file(opt.hyp) # check files assert len(opt.cfg) or len( opt.weights), 'either --cfg or --weights must be specified' opt.img_size.extend( [opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) log_dir = increment_dir(Path(opt.logdir) / 'exp', opt.name) # runs/exp1 device = select_device(opt.device, batch_size=opt.batch_size) # DDP mode if opt.local_rank != -1: assert torch.cuda.device_count() > opt.local_rank torch.cuda.set_device(opt.local_rank) device = torch.device('cuda', opt.local_rank) dist.init_process_group(backend='nccl', init_method='env://') # distributed backend assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' opt.batch_size = opt.total_batch_size // opt.world_size logger.info(opt) with open(opt.hyp) as f: hyp = yaml.load(f, Loader=yaml.FullLoader) # load hyps # Train if not opt.evolve: tb_writer = None if opt.global_rank in [-1, 0]: logger.info( 'Start Tensorboard with "tensorboard --logdir %s", view at http://localhost:6006/' % opt.logdir) tb_writer = SummaryWriter(log_dir=log_dir) # runs/exp0 train(hyp, opt, device, tb_writer) # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = { 'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr 'giou': (1, 0.02, 0.2), # GIoU loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight 'iou_t': (0, 0.1, 0.7), # IoU training threshold 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) 'scale': (1, 0.0, 0.9), # image scale (+/- gain) 'shear': (1, 0.0, 10.0), # image shear (+/- deg) 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) 'mosaic': (1, 0.0, 1.0), # image mixup (probability) 'mixup': (1, 0.0, 1.0) } # image mixup (probability) assert opt.local_rank == -1, 'DDP mode not implemented for --evolve' opt.notest, opt.nosave = True, True # only test/save final epoch # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices yaml_file = Path( 'runs/evolve/hyp_evolved.yaml') # save best result here if opt.bucket: os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists for _ in range(300): # generations to evolve if os.path.exists( 'evolve.txt' ): # if evolve.txt exists: select best hyps and mutate # Select parent(s) parent = 'single' # parent selection method: 'single' or 'weighted' x = np.loadtxt('evolve.txt', ndmin=2) n = min(5, len(x)) # number of previous results to consider x = x[np.argsort(-fitness(x))][:n] # top n mutations w = fitness(x) - fitness(x).min() # weights if parent == 'single' or len(x) == 1: # x = x[random.randint(0, n - 1)] # random selection x = x[random.choices(range(n), weights=w)[0]] # weighted selection elif parent == 'weighted': x = (x * w.reshape( n, 1)).sum(0) / w.sum() # weighted combination # Mutate mp, s = 0.8, 0.2 # mutation probability, sigma npr = np.random npr.seed(int(time.time())) g = np.array([x[0] for x in meta.values()]) # gains 0-1 ng = len(meta) v = np.ones(ng) while all( v == 1 ): # mutate until a change occurs (prevent duplicates) v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) hyp[k] = float(x[i + 7] * v[i]) # mutate # Constrain to limits for k, v in meta.items(): hyp[k] = max(hyp[k], v[1]) # lower limit hyp[k] = min(hyp[k], v[2]) # upper limit hyp[k] = round(hyp[k], 5) # significant digits # Train mutation results = train(hyp.copy(), opt, device) # Write mutation results print_mutation(hyp.copy(), results, yaml_file, opt.bucket) # Plot results plot_evolution(yaml_file) print( 'Hyperparameter evolution complete. Best results saved as: %s\nCommand to train a new model with these ' 'hyperparameters: $ python train.py --hyp %s' % (yaml_file, yaml_file))
def main(opt): set_logging(RANK) if RANK in [-1, 0]: print(colorstr('train: ') + ', '.join(f'{k}={v}' for k, v in vars(opt).items())) check_git_status() check_requirements(exclude=['thop']) # Resume wandb_run = check_wandb_resume(opt) if opt.resume and not wandb_run: # resume an interrupted run ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist' with open(Path(ckpt).parent.parent / 'opt.yaml') as f: opt = argparse.Namespace(**yaml.safe_load(f)) # replace opt.cfg, opt.weights, opt.resume = '', ckpt, True # reinstate logger.info('Resuming training from %s' % ckpt) else: # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp) # check files assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified' opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) opt.name = 'evolve' if opt.evolve else opt.name opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve)) # DDP mode device = select_device(opt.device, batch_size=opt.batch_size) if LOCAL_RANK != -1: from datetime import timedelta assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command' torch.cuda.set_device(LOCAL_RANK) device = torch.device('cuda', LOCAL_RANK) dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo", timeout=timedelta(seconds=60)) assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count' assert not opt.image_weights, '--image-weights argument is not compatible with DDP training' # Train if not opt.evolve: train(opt.hyp, opt, device) if WORLD_SIZE > 1 and RANK == 0: _ = [print('Destroying process group... ', end=''), dist.destroy_process_group(), print('Done.')] # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = {'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr 'box': (1, 0.02, 0.2), # box loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight 'iou_t': (0, 0.1, 0.7), # IoU training threshold 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) 'scale': (1, 0.0, 0.9), # image scale (+/- gain) 'shear': (1, 0.0, 10.0), # image shear (+/- deg) 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) 'mosaic': (1, 0.0, 1.0), # image mixup (probability) 'mixup': (1, 0.0, 1.0)} # image mixup (probability) with open(opt.hyp) as f: hyp = yaml.safe_load(f) # load hyps dict assert LOCAL_RANK == -1, 'DDP mode not implemented for --evolve' opt.notest, opt.nosave = True, True # only test/save final epoch # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml' # save best result here if opt.bucket: os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists for _ in range(300): # generations to evolve if Path('evolve.txt').exists(): # if evolve.txt exists: select best hyps and mutate # Select parent(s) parent = 'single' # parent selection method: 'single' or 'weighted' x = np.loadtxt('evolve.txt', ndmin=2) n = min(5, len(x)) # number of previous results to consider x = x[np.argsort(-fitness(x))][:n] # top n mutations w = fitness(x) - fitness(x).min() + 1E-6 # weights (sum > 0) if parent == 'single' or len(x) == 1: # x = x[random.randint(0, n - 1)] # random selection x = x[random.choices(range(n), weights=w)[0]] # weighted selection elif parent == 'weighted': x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination # Mutate mp, s = 0.8, 0.2 # mutation probability, sigma npr = np.random npr.seed(int(time.time())) g = np.array([x[0] for x in meta.values()]) # gains 0-1 ng = len(meta) v = np.ones(ng) while all(v == 1): # mutate until a change occurs (prevent duplicates) v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) hyp[k] = float(x[i + 7] * v[i]) # mutate # Constrain to limits for k, v in meta.items(): hyp[k] = max(hyp[k], v[1]) # lower limit hyp[k] = min(hyp[k], v[2]) # upper limit hyp[k] = round(hyp[k], 5) # significant digits # Train mutation results = train(hyp.copy(), opt, device) # Write mutation results print_mutation(hyp.copy(), results, yaml_file, opt.bucket) # Plot results plot_evolution(yaml_file) print(f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n' f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}')
def train(self, ep=0): # ep = 66 input parser = argparse.ArgumentParser() parser.add_argument('--weights', type=str, default=config.weight, help='initial weights path') parser.add_argument('--cfg', type=str, default=config.config_model, help='models/yolov5s.yaml path') parser.add_argument('--data', type=str, default=config.config_data, help='data.yaml path') parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path') parser.add_argument('--epochs', type=int, default=config.epochs) parser.add_argument('--batch-size', type=int, default=config.batch_size, help='total batch size for all GPUs') parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes') parser.add_argument('--rect', action='store_true', help='rectangular training') parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training') parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') parser.add_argument('--notest', action='store_true', help='only test final epoch') parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check') # parser.add_argument('--noautoanchor', type=bool, default=True, help='disable autoanchor check') parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters') # parser.add_argument('--evolve', type = bool, default = False, help='evolve hyperparameters') parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training') parser.add_argument( '--device', default=config.device, help='cuda device, i.e. 0 or 0,1,2,3 or cpu') # using GPU 0 parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%') parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class') # parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer') parser.add_argument( '--adam', type=int, default=config.adam, help='use torch.optim.Adam() optimizer') # using Adam optimizer parser.add_argument( '--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') parser.add_argument('--log-imgs', type=int, default=16, help='number of images for W&B logging, max 100') parser.add_argument('--log-artifacts', action='store_true', help='log artifacts, i.e. final trained model') parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers') parser.add_argument('--project', default=config.project_train, help='save to project/name') parser.add_argument('--name', default=config.name, help='save to project/name') # parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') parser.add_argument('--exist-ok', type=int, default=config.exist_ok, help='existing project/name ok, do not increment') opt = parser.parse_args() # Set DDP variables opt.total_batch_size = opt.batch_size #batch_size = 1 opt.world_size = int( os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 opt.global_rank = int( os.environ['RANK']) if 'RANK' in os.environ else -1 set_logging(opt.global_rank) if opt.global_rank in [-1, 0]: check_git_status() # Resume 恢复训练标志 if opt.resume: # resume an interrupted run ckpt = opt.resume if isinstance( opt.resume, str) else get_latest_run() # specified or most recent path assert os.path.isfile( ckpt), 'ERROR: --resume checkpoint does not exist' with open(Path(ckpt).parent.parent / 'opt.yaml') as f: opt = argparse.Namespace(**yaml.load( f, Loader=yaml.FullLoader)) # replace opt.cfg, opt.weights, opt.resume = '', ckpt, True logger.info('Resuming training from %s' % ckpt) else: # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file( opt.cfg), check_file(opt.hyp) # check files # opt.data : data/gun.yaml # opt.cfg : modules/yolo_gun.yaml # opt.hyp : data/hyp.scratch.yaml assert len(opt.cfg) or len( opt.weights), 'either --cfg or --weights must be specified' opt.img_size.extend( [opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) # opt.img_size: todo:没看懂在做什么 opt.name = 'evolve' if opt.evolve else opt.name opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve) # increment run # DDP mode device = select_device(opt.device, batch_size=opt.batch_size) if opt.local_rank != -1: assert torch.cuda.device_count() > opt.local_rank torch.cuda.set_device(opt.local_rank) device = torch.device('cuda', opt.local_rank) dist.init_process_group( backend='nccl', init_method='env://') # distributed backend assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' opt.batch_size = opt.total_batch_size // opt.world_size # Hyperparameters超参数 打开hpy.scratch.yml 文件 with open(opt.hyp) as f: hyp = yaml.load(f, Loader=yaml.FullLoader) # load hyps if 'box' not in hyp: warn( 'Compatibility: %s missing "box" which was renamed from "giou" in %s' % (opt.hyp, 'https://github.com/ultralytics/yolov5/pull/1120')) hyp['box'] = hyp.pop('giou') # Train logger.info(opt) if not opt.evolve: tb_writer = None # init loggers if opt.global_rank in [-1, 0]: logger.info( f'Start Tensorboard with "tensorboard --logdir {opt.project}", view at http://localhost:6006/' ) tb_writer = SummaryWriter(opt.save_dir) # Tensorboard train(hyp, opt, device, tb_writer, wandb, ep)
def main(override_args=None): parser = argparse.ArgumentParser() parser.add_argument('--weights', type=str, default='', help='initial weights path') parser.add_argument('--cfg', type=str, default='', help='model.yaml path') parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path') parser.add_argument('--epochs', type=int, default=300) parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs') parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes') parser.add_argument('--rect', action='store_true', help='rectangular training') parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training') parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') parser.add_argument('--notest', action='store_true', help='only test final epoch') parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check') parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters') parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training') parser.add_argument( '--name', default='', help='renames experiment folder exp{N} to exp{N}_{name} if supplied') parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%') parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset') parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer') parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') parser.add_argument('--logdir', type=str, default='runs/', help='logging directory') parser.add_argument('--log-imgs', type=int, default=10, help='number of images for W&B logging, max 100') parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers') # changed/added parameters: parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path OR path to the HHI Json Dataset') # here you can set the path to the "yaml" file that describes the dataset, just as before # but if you give a path to a JSON file, then it will be read and parsed as a HHI Json Format Dataset metadata file parser.add_argument( '--val-size', type=float, default=0.05, help='fraction of the Json Dataset, used for validation') # if using the HHI Json Format Dataset format, this sets the ratio of the dataset, that will be used for validation (random samples) parser.add_argument( '--classes', type=str, default='rectangle', help='limit training to only these class or class list', nargs="+") # if using the HHI Json Format Dataset, normally all annotations of the 'rectangle' type will be used for training, # but you can limit the training to only specific classes, and give them as a space-separated list # (use ""-quotation marks, if a class name has whitespaces), e.g.: # python train.py --data ... --classes gec_object bad_gec_object "screw hole" hand "open hand" --val_size 0.05 ... opt = parser.parse_args() # --------------- # override the command-line parameters: if override_args is not None: opt.data = override_args.data opt.cfg = override_args.cfg opt.multi_scale = override_args.multi_scale opt.val_size = override_args.val_size opt.epochs = override_args.epochs opt.batch_size = override_args.batch_size opt.classes = override_args.classes opt.device = override_args.device # --------------- # Set DDP variables opt.total_batch_size = opt.batch_size opt.world_size = int( os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1 set_logging(opt.global_rank) if opt.global_rank in [-1, 0]: check_git_status() # Resume if opt.resume: # resume an interrupted run ckpt = opt.resume if isinstance( opt.resume, str) else get_latest_run() # specified or most recent path log_dir = Path(ckpt).parent.parent # runs/exp0 assert os.path.isfile( ckpt), 'ERROR: --resume checkpoint does not exist' with open(log_dir / 'opt.yaml') as f: opt = argparse.Namespace(**yaml.load( f, Loader=yaml.FullLoader)) # replace opt.cfg, opt.weights, opt.resume = '', ckpt, True logger.info('Resuming training from %s' % ckpt) else: # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file( opt.cfg), check_file(opt.hyp) # check files assert len(opt.cfg) or len( opt.weights), 'either --cfg or --weights must be specified' opt.img_size.extend( [opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) log_dir = increment_dir(Path(opt.logdir) / 'exp', opt.name) # runs/exp1 # DDP mode device = select_device(opt.device, batch_size=opt.batch_size) if opt.local_rank != -1: assert torch.cuda.device_count() > opt.local_rank torch.cuda.set_device(opt.local_rank) device = torch.device('cuda', opt.local_rank) dist.init_process_group(backend='nccl', init_method='env://') # distributed backend assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' opt.batch_size = opt.total_batch_size // opt.world_size # Hyperparameters with open(opt.hyp) as f: hyp = yaml.load(f, Loader=yaml.FullLoader) # load hyps if 'box' not in hyp: warn( 'Compatibility: %s missing "box" which was renamed from "giou" in %s' % (opt.hyp, 'https://github.com/ultralytics/yolov5/pull/1120')) hyp['box'] = hyp.pop('giou') # Train logger.info(opt) if not opt.evolve: tb_writer, wandb = None, None # init loggers if opt.global_rank in [-1, 0]: # Tensorboard logger.info( f'Start Tensorboard with "tensorboard --logdir {opt.logdir}", view at http://localhost:6006/' ) tb_writer = SummaryWriter(log_dir=log_dir) # runs/exp0 # W&B try: import wandb assert os.environ.get('WANDB_DISABLED') != 'true' logger.info( "Weights & Biases logging enabled, to disable set os.environ['WANDB_DISABLED'] = 'true'" ) except (ImportError, AssertionError): opt.log_imgs = 0 logger.info( "Install Weights & Biases for experiment logging via 'pip install wandb' (recommended)" ) train(hyp, opt, device, tb_writer, wandb) # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = { 'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr 'box': (1, 0.02, 0.2), # box loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight 'iou_t': (0, 0.1, 0.7), # IoU training threshold 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) 'scale': (1, 0.0, 0.9), # image scale (+/- gain) 'shear': (1, 0.0, 10.0), # image shear (+/- deg) 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) 'mosaic': (1, 0.0, 1.0), # image mixup (probability) 'mixup': (1, 0.0, 1.0) } # image mixup (probability) assert opt.local_rank == -1, 'DDP mode not implemented for --evolve' opt.notest, opt.nosave = True, True # only test/save final epoch # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices yaml_file = Path( opt.logdir ) / 'evolve' / 'hyp_evolved.yaml' # save best result here if opt.bucket: os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists for _ in range(300): # generations to evolve if os.path.exists( 'evolve.txt' ): # if evolve.txt exists: select best hyps and mutate # Select parent(s) parent = 'single' # parent selection method: 'single' or 'weighted' x = np.loadtxt('evolve.txt', ndmin=2) n = min(5, len(x)) # number of previous results to consider x = x[np.argsort(-fitness(x))][:n] # top n mutations w = fitness(x) - fitness(x).min() # weights if parent == 'single' or len(x) == 1: # x = x[random.randint(0, n - 1)] # random selection x = x[random.choices(range(n), weights=w)[0]] # weighted selection elif parent == 'weighted': x = (x * w.reshape( n, 1)).sum(0) / w.sum() # weighted combination # Mutate mp, s = 0.8, 0.2 # mutation probability, sigma npr = np.random npr.seed(int(time.time())) g = np.array([x[0] for x in meta.values()]) # gains 0-1 ng = len(meta) v = np.ones(ng) while all( v == 1 ): # mutate until a change occurs (prevent duplicates) v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) hyp[k] = float(x[i + 7] * v[i]) # mutate # Constrain to limits for k, v in meta.items(): hyp[k] = max(hyp[k], v[1]) # lower limit hyp[k] = min(hyp[k], v[2]) # upper limit hyp[k] = round(hyp[k], 5) # significant digits # Train mutation results = train(hyp.copy(), opt, device) # Write mutation results print_mutation(hyp.copy(), results, yaml_file, opt.bucket) # Plot results plot_evolution(yaml_file) print( f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n' f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}' )
def run(opt: DictConfig) -> None: print(opt) # Set DDP variables opt.world_size = int( os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 opt.global_rank = int(os.environ["RANK"]) if "RANK" in os.environ else -1 set_logging(opt.global_rank) if opt.global_rank in [-1, 0]: os.chdir( "/content/drive/My Drive/Colab Notebooks/AITraining/yolo/yolov5/") check_git_status() check_requirements() # Resume if opt.resume: # resume an interrupted run ckpt = ( opt.resume if isinstance(opt.resume, str) else get_latest_run() ) # specified or most recent path assert os.path.isfile( ckpt), "ERROR: --resume checkpoint does not exist" apriori = opt.global_rank, opt.local_rank with open(Path(ckpt).parent.parent / "opt.yaml") as f: opt = argparse.Namespace(**yaml.load( f, Loader=yaml.SafeLoader)) # replace ( opt.cfg, opt.weights, opt.resume, opt.batch_size, opt.global_rank, opt.local_rank, ) = ( "", ckpt, True, opt.total_batch_size, *apriori, ) # reinstate logger.info("Resuming training from %s" % ckpt) else: # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') opt.data, opt.cfg, opt.hyp = ( check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp), ) # check files assert len(opt.cfg) or len( opt.weights), "either --cfg or --weights must be specified" opt.img_size.extend( [opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) opt.name = "evolve" if opt.evolve else opt.name opt.save_dir = increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve) # increment run # DDP mode opt.total_batch_size = opt.batch_size device = select_device(opt.device, batch_size=opt.batch_size) if opt.local_rank != -1: assert torch.cuda.device_count() > opt.local_rank torch.cuda.set_device(opt.local_rank) device = torch.device("cuda", opt.local_rank) dist.init_process_group(backend="nccl", init_method="env://") # distributed backend assert (opt.batch_size % opt.world_size == 0 ), "--batch-size must be multiple of CUDA device count" opt.batch_size = opt.total_batch_size // opt.world_size # Hyperparameters with open(opt.hyp) as f: hyp = yaml.load(f, Loader=yaml.SafeLoader) # load hyps # Train logger.info(opt) try: import wandb except ImportError: wandb = None prefix = colorstr("wandb: ") logger.info( f"{prefix}Install Weights & Biases for YOLOv5 logging with 'pip install wandb' (recommended)" ) if not opt.evolve: tb_writer = None # init loggers if opt.global_rank in [-1, 0]: logger.info( f'Start Tensorboard with "tensorboard --logdir {opt.project}", view at http://localhost:6006/' ) tb_writer = SummaryWriter(opt.save_dir) # Tensorboard train(hyp, opt, device, tb_writer, wandb) # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = { "lr0": (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) "lrf": (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) "momentum": (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 "weight_decay": (1, 0.0, 0.001), # optimizer weight decay "warmup_epochs": (1, 0.0, 5.0), # warmup epochs (fractions ok) "warmup_momentum": (1, 0.0, 0.95), # warmup initial momentum "warmup_bias_lr": (1, 0.0, 0.2), # warmup initial bias lr "box": (1, 0.02, 0.2), # box loss gain "cls": (1, 0.2, 4.0), # cls loss gain "cls_pw": (1, 0.5, 2.0), # cls BCELoss positive_weight "obj": (1, 0.2, 4.0), # obj loss gain (scale with pixels) "obj_pw": (1, 0.5, 2.0), # obj BCELoss positive_weight "iou_t": (0, 0.1, 0.7), # IoU training threshold "anchor_t": (1, 2.0, 8.0), # anchor-multiple threshold "anchors": (2, 2.0, 10.0), # anchors per output grid (0 to ignore) "fl_gamma": ( 0, 0.0, 2.0, ), # focal loss gamma (efficientDet default gamma=1.5) "hsv_h": (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) "hsv_s": (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) "hsv_v": (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) "degrees": (1, 0.0, 45.0), # image rotation (+/- deg) "translate": (1, 0.0, 0.9), # image translation (+/- fraction) "scale": (1, 0.0, 0.9), # image scale (+/- gain) "shear": (1, 0.0, 10.0), # image shear (+/- deg) "perspective": ( 0, 0.0, 0.001, ), # image perspective (+/- fraction), range 0-0.001 "flipud": (1, 0.0, 1.0), # image flip up-down (probability) "fliplr": (0, 0.0, 1.0), # image flip left-right (probability) "mosaic": (1, 0.0, 1.0), # image mixup (probability) "mixup": (1, 0.0, 1.0), } # image mixup (probability) assert opt.local_rank == -1, "DDP mode not implemented for --evolve" opt.notest, opt.nosave = True, True # only test/save final epoch # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices yaml_file = Path( opt.save_dir) / "hyp_evolved.yaml" # save best result here if opt.bucket: os.system("gsutil cp gs://%s/evolve.txt ." % opt.bucket) # download evolve.txt if exists for _ in range(300): # generations to evolve if Path("evolve.txt").exists( ): # if evolve.txt exists: select best hyps and mutate # Select parent(s) parent = "single" # parent selection method: 'single' or 'weighted' x = np.loadtxt("evolve.txt", ndmin=2) n = min(5, len(x)) # number of previous results to consider x = x[np.argsort(-fitness(x))][:n] # top n mutations w = fitness(x) - fitness(x).min() # weights if parent == "single" or len(x) == 1: # x = x[random.randint(0, n - 1)] # random selection x = x[random.choices(range(n), weights=w)[0]] # weighted selection elif parent == "weighted": x = (x * w.reshape( n, 1)).sum(0) / w.sum() # weighted combination # Mutate mp, s = 0.8, 0.2 # mutation probability, sigma npr = np.random npr.seed(int(time.time())) g = np.array([x[0] for x in meta.values()]) # gains 0-1 ng = len(meta) v = np.ones(ng) while all( v == 1 ): # mutate until a change occurs (prevent duplicates) v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) hyp[k] = float(x[i + 7] * v[i]) # mutate # Constrain to limits for k, v in meta.items(): hyp[k] = max(hyp[k], v[1]) # lower limit hyp[k] = min(hyp[k], v[2]) # upper limit hyp[k] = round(hyp[k], 5) # significant digits # Train mutation results = train(hyp.copy(), opt, device, wandb=wandb) # Write mutation results print_mutation(hyp.copy(), results, yaml_file, opt.bucket) # Plot results plot_evolution(yaml_file) print( f"Hyperparameter evolution complete. Best results saved as: {yaml_file}\n" f"Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}" )
def main(opt, callbacks=Callbacks()): # Checks if RANK in [-1, 0]: print_args(FILE.stem, opt) check_git_status() check_requirements(exclude=['thop']) # Resume # TODO resume mode # if opt.resume and not check_wandb_resume(opt): # resume an interrupted run # ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path # assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist' # with open(Path(ckpt).parent.parent / 'opt.yaml', errors='ignore') as f: # opt = argparse.Namespace(**yaml.safe_load(f)) # replace # opt.cfg, opt.weights, opt.resume = '', ckpt, True # reinstate # LOGGER.info(f'Resuming training from {ckpt}') if 1: opt.data, opt.cfg, opt.hyp, opt.weights, opt.project = \ check_file(opt.data), check_yaml(opt.cfg), check_yaml(opt.hyp), str(opt.weights), str(opt.project) # checks assert len(opt.cfg) or len( opt.weights), 'either --cfg or --weights must be specified' opt.save_dir = str( increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok)) # DDP mode device = select_device(opt.device, batch_size=opt.batch_size) # if LOCAL_RANK != -1: # assert torch.cuda.device_count() > LOCAL_RANK, 'insufficient CUDA devices for DDP command' # assert opt.batch_size % WORLD_SIZE == 0, '--batch-size must be multiple of CUDA device count' # assert not opt.image_weights, '--image-weights argument is not compatible with DDP training' # assert not opt.evolve, '--evolve argument is not compatible with DDP training' # torch.cuda.set_device(LOCAL_RANK) # device = torch.device('cuda', LOCAL_RANK) # dist.init_process_group(backend="nccl" if dist.is_nccl_available() else "gloo") # Freeze train parent_save_dir = opt.save_dir with open(opt.freeze_plan, errors='freeze_plan cannot find!') as f: freeze_plan = yaml.safe_load(f) for arg in ['evolve', 'resume']: if opt.__getattribute__('resume'): opt.__setattr__(arg, None) LOGGER.info( f'Currently option --{arg} is not support in freeze training mode') LOGGER.info('Starting freeze training!') for i, plan in enumerate(freeze_plan['train']): print(plan, '\n') assert len( plan ) == 2, "ERROR: Please check your freeze plan format! It should be [strategy, epoch]" strategy = plan[0] strategy_epochs = plan[1] for strategy_epoch in range(strategy_epochs): for step in freeze_plan[strategy]: if len(step) == 2: opt.freeze_type = step[0] opt.freeze = '' opt.epochs = step[1] LOGGER.info( f'Currently training strategy epoch {strategy_epoch}, step {i} of strategy {strategy}; training without freeze, \ Epoch is {opt.epochs}.') elif len(step) == 3: opt.freeze_type = step[0] opt.freeze = step[1] opt.epochs = step[2] LOGGER.info( f'Currently training strategy epoch {strategy_epoch}, step {i} of strategy {strategy}; Freeze type is {opt.freeze}, \ Epoch is {opt.epochs}, Freeze layers {opt.freeze}') else: raise Exception( f'ERROR: format of each step in strategy should be [layer select type, epoch] or [layer select type, freeze layer, epoch]' ) opt.save_dir = str( increment_path(Path(parent_save_dir) / opt.name, exist_ok=opt.exist_ok)) train.train(opt.hyp, opt, device, callbacks) if WORLD_SIZE > 1 and RANK == 0: LOGGER.info('Destroying process group... ') dist.destroy_process_group()
def main(): parser = argparse.ArgumentParser() parser.add_argument('--weights', type=str, default='yolov5s.pt', help='initial weights path') parser.add_argument('--cfg', type=str, default='', help='model.yaml path') parser.add_argument('--data', type=str, default='data/coco128.yaml', help='data.yaml path') parser.add_argument('--hyp', type=str, default='data/hyp.scratch.yaml', help='hyperparameters path') parser.add_argument('--epochs', type=int, default=300) parser.add_argument('--batch-size', type=int, default=16, help='total batch size for all GPUs') parser.add_argument('--img-size', nargs='+', type=int, default=[640, 640], help='[train, test] image sizes') parser.add_argument('--rect', action='store_true', help='rectangular training') parser.add_argument('--resume', nargs='?', const=True, default=False, help='resume most recent training') parser.add_argument('--nosave', action='store_true', help='only save final checkpoint') parser.add_argument('--notest', action='store_true', help='only test final epoch') parser.add_argument('--noautoanchor', action='store_true', help='disable autoanchor check') parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters') parser.add_argument('--bucket', type=str, default='', help='gsutil bucket') parser.add_argument('--cache-images', action='store_true', help='cache images for faster training') parser.add_argument('--image-weights', action='store_true', help='use weighted image selection for training') parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu') parser.add_argument('--multi-scale', action='store_true', help='vary img-size +/- 50%%') parser.add_argument('--single-cls', action='store_true', help='train multi-class data as single-class') parser.add_argument('--adam', action='store_true', help='use torch.optim.Adam() optimizer') parser.add_argument('--sync-bn', action='store_true', help='use SyncBatchNorm, only available in DDP mode') parser.add_argument('--local_rank', type=int, default=-1, help='DDP parameter, do not modify') parser.add_argument('--workers', type=int, default=8, help='maximum number of dataloader workers') parser.add_argument('--project', default='runs/train', help='save to project/name') parser.add_argument('--entity', default=None, help='W&B entity') parser.add_argument('--name', default='exp', help='save to project/name') parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment') parser.add_argument('--quad', action='store_true', help='quad dataloader') parser.add_argument('--linear-lr', action='store_true', help='linear LR') parser.add_argument('--label-smoothing', type=float, default=0.0, help='Label smoothing epsilon') parser.add_argument('--upload_dataset', action='store_true', help='Upload dataset as W&B artifact table') parser.add_argument('--bbox_interval', type=int, default=-1, help='Set bounding-box image logging interval for W&B') parser.add_argument('--save_period', type=int, default=-1, help='Log model after every "save_period" epoch') parser.add_argument('--artifact_alias', type=str, default="latest", help='version of dataset artifact to be used') parser.add_argument('--sly', action='store_true', help='for Supervisely App integration') parser.add_argument('--metrics_period', type=int, default=1, help='Log metrics to Supervisely every "metrics_period" epochs') opt = parser.parse_args() print("Input arguments:", opt) # Set DDP variables opt.world_size = int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 opt.global_rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1 set_logging(opt.global_rank) if opt.global_rank in [-1, 0]: check_git_status() #check_requirements() # Resume wandb_run = check_wandb_resume(opt) if opt.resume and not wandb_run: # resume an interrupted run ckpt = opt.resume if isinstance(opt.resume, str) else get_latest_run() # specified or most recent path assert os.path.isfile(ckpt), 'ERROR: --resume checkpoint does not exist' apriori = opt.global_rank, opt.local_rank with open(Path(ckpt).parent.parent / 'opt.yaml') as f: opt = argparse.Namespace(**yaml.safe_load(f)) # replace opt.cfg, opt.weights, opt.resume, opt.batch_size, opt.global_rank, opt.local_rank = \ '', ckpt, True, opt.total_batch_size, *apriori # reinstate logger.info('Resuming training from %s' % ckpt) else: # opt.hyp = opt.hyp or ('hyp.finetune.yaml' if opt.weights else 'hyp.scratch.yaml') opt.data, opt.cfg, opt.hyp = check_file(opt.data), check_file(opt.cfg), check_file(opt.hyp) # check files assert len(opt.cfg) or len(opt.weights), 'either --cfg or --weights must be specified' opt.img_size.extend([opt.img_size[-1]] * (2 - len(opt.img_size))) # extend to 2 sizes (train, test) opt.name = 'evolve' if opt.evolve else opt.name opt.save_dir = str(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok | opt.evolve)) # DDP mode opt.total_batch_size = opt.batch_size device = select_device(opt.device, batch_size=opt.batch_size) if opt.local_rank != -1: assert torch.cuda.device_count() > opt.local_rank torch.cuda.set_device(opt.local_rank) device = torch.device('cuda', opt.local_rank) dist.init_process_group(backend='nccl', init_method='env://') # distributed backend assert opt.batch_size % opt.world_size == 0, '--batch-size must be multiple of CUDA device count' opt.batch_size = opt.total_batch_size // opt.world_size # Hyperparameters with open(opt.hyp) as f: hyp = yaml.safe_load(f) # load hyps # Train logger.info(opt) if not opt.evolve: tb_writer = None # init loggers if opt.global_rank in [-1, 0]: #prefix = colorstr('tensorboard: ') #logger.info(f"{prefix}Start with 'tensorboard --logdir {opt.project}', view at http://localhost:6006/") tb_writer = SummaryWriter(opt.save_dir) # Tensorboard train(hyp, opt, device, tb_writer) # Evolve hyperparameters (optional) else: # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit) meta = {'lr0': (1, 1e-5, 1e-1), # initial learning rate (SGD=1E-2, Adam=1E-3) 'lrf': (1, 0.01, 1.0), # final OneCycleLR learning rate (lr0 * lrf) 'momentum': (0.3, 0.6, 0.98), # SGD momentum/Adam beta1 'weight_decay': (1, 0.0, 0.001), # optimizer weight decay 'warmup_epochs': (1, 0.0, 5.0), # warmup epochs (fractions ok) 'warmup_momentum': (1, 0.0, 0.95), # warmup initial momentum 'warmup_bias_lr': (1, 0.0, 0.2), # warmup initial bias lr 'box': (1, 0.02, 0.2), # box loss gain 'cls': (1, 0.2, 4.0), # cls loss gain 'cls_pw': (1, 0.5, 2.0), # cls BCELoss positive_weight 'obj': (1, 0.2, 4.0), # obj loss gain (scale with pixels) 'obj_pw': (1, 0.5, 2.0), # obj BCELoss positive_weight 'iou_t': (0, 0.1, 0.7), # IoU training threshold 'anchor_t': (1, 2.0, 8.0), # anchor-multiple threshold 'anchors': (2, 2.0, 10.0), # anchors per output grid (0 to ignore) 'fl_gamma': (0, 0.0, 2.0), # focal loss gamma (efficientDet default gamma=1.5) 'hsv_h': (1, 0.0, 0.1), # image HSV-Hue augmentation (fraction) 'hsv_s': (1, 0.0, 0.9), # image HSV-Saturation augmentation (fraction) 'hsv_v': (1, 0.0, 0.9), # image HSV-Value augmentation (fraction) 'degrees': (1, 0.0, 45.0), # image rotation (+/- deg) 'translate': (1, 0.0, 0.9), # image translation (+/- fraction) 'scale': (1, 0.0, 0.9), # image scale (+/- gain) 'shear': (1, 0.0, 10.0), # image shear (+/- deg) 'perspective': (0, 0.0, 0.001), # image perspective (+/- fraction), range 0-0.001 'flipud': (1, 0.0, 1.0), # image flip up-down (probability) 'fliplr': (0, 0.0, 1.0), # image flip left-right (probability) 'mosaic': (1, 0.0, 1.0), # image mixup (probability) 'mixup': (1, 0.0, 1.0)} # image mixup (probability) assert opt.local_rank == -1, 'DDP mode not implemented for --evolve' opt.notest, opt.nosave = True, True # only test/save final epoch # ei = [isinstance(x, (int, float)) for x in hyp.values()] # evolvable indices yaml_file = Path(opt.save_dir) / 'hyp_evolved.yaml' # save best result here if opt.bucket: os.system('gsutil cp gs://%s/evolve.txt .' % opt.bucket) # download evolve.txt if exists for _ in range(300): # generations to evolve if Path('evolve.txt').exists(): # if evolve.txt exists: select best hyps and mutate # Select parent(s) parent = 'single' # parent selection method: 'single' or 'weighted' x = np.loadtxt('evolve.txt', ndmin=2) n = min(5, len(x)) # number of previous results to consider x = x[np.argsort(-fitness(x))][:n] # top n mutations w = fitness(x) - fitness(x).min() # weights if parent == 'single' or len(x) == 1: # x = x[random.randint(0, n - 1)] # random selection x = x[random.choices(range(n), weights=w)[0]] # weighted selection elif parent == 'weighted': x = (x * w.reshape(n, 1)).sum(0) / w.sum() # weighted combination # Mutate mp, s = 0.8, 0.2 # mutation probability, sigma npr = np.random npr.seed(int(time.time())) g = np.array([x[0] for x in meta.values()]) # gains 0-1 ng = len(meta) v = np.ones(ng) while all(v == 1): # mutate until a change occurs (prevent duplicates) v = (g * (npr.random(ng) < mp) * npr.randn(ng) * npr.random() * s + 1).clip(0.3, 3.0) for i, k in enumerate(hyp.keys()): # plt.hist(v.ravel(), 300) hyp[k] = float(x[i + 7] * v[i]) # mutate # Constrain to limits for k, v in meta.items(): hyp[k] = max(hyp[k], v[1]) # lower limit hyp[k] = min(hyp[k], v[2]) # upper limit hyp[k] = round(hyp[k], 5) # significant digits # Train mutation results = train(hyp.copy(), opt, device) # Write mutation results print_mutation(hyp.copy(), results, yaml_file, opt.bucket) # Plot results plot_evolution(yaml_file) print(f'Hyperparameter evolution complete. Best results saved as: {yaml_file}\n' f'Command to train a new model with these hyperparameters: $ python train.py --hyp {yaml_file}')