def _test_broadcast_helper(self, group, group_id, rank, cuda=False, rank_to_GPU=None):
    for ttype, value, requires_cuda in [
        ('torch.FloatTensor', -1e-10, False),
        ('torch.DoubleTensor', -1e-100, False),
        ('torch.HalfTensor', -0.1, True),
        ('torch.CharTensor', -2, False),
        ('torch.ByteTensor', 129, False),
        ('torch.IntTensor', -1e5, False),
        ('torch.LongTensor', -1e15, False),
    ]:
        if requires_cuda and not cuda:
            continue
        for src in group:
            expected_tensor = _build_tensor(src + 1, value).type(ttype)
            if cuda:
                expected_tensor = expected_tensor.cuda(rank_to_GPU[rank][0])
            if rank == src:
                dist.broadcast(expected_tensor, src, group_id)
            else:
                tensor = _build_tensor(src + 1, -1).type(ttype)
                if cuda:
                    tensor = tensor.cuda(rank_to_GPU[rank][0])
                dist.broadcast(tensor, src, group_id)
                self.assertEqual(tensor.size(), expected_tensor.size())
                self.assertEqual(tensor.ne(expected_tensor).max(), 0)
    self._barrier()
def consistent_indices(self, rank, indices, shuffle):
    """ synchronize indices among workers. """
    if rank == 0 and shuffle:
        random.shuffle(indices)
    # broadcast.
    indices = torch.IntTensor(indices)
    dist.broadcast(indices, src=0)
    return list(indices)
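# A minimal, hypothetical usage sketch for consistent_indices above: every rank calls it
# with the same initial index list, and dist.broadcast overwrites the non-zero ranks'
# copies with rank 0's shuffled order. Assumes an already-initialized process group that
# supports CPU tensors (e.g. gloo); `sampler` is an illustrative object exposing the method.
import torch.distributed as dist

def example_epoch_indices(sampler, epoch_size):
    rank = dist.get_rank()
    indices = list(range(epoch_size))   # identical starting order on every rank
    indices = sampler.consistent_indices(rank, indices, shuffle=True)
    return indices                      # same permutation on all ranks after the broadcast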
def _test_broadcast_helper(self, group, group_id, rank, cuda=False):
    for src in group:
        expected_tensor = _build_tensor(src + 1)
        if cuda:
            expected_tensor = expected_tensor.cuda()
        if rank == src:
            dist.broadcast(expected_tensor, src, group_id)
        else:
            tensor = _build_tensor(src + 1, -1)
            if cuda:
                tensor = tensor.cuda()
            dist.broadcast(tensor, src, group_id)
            self.assertEqual(tensor, expected_tensor)
    self._barrier()
def _dist_broadcast_coalesced(self, tensors, buffer_size):
    """
    Broadcast a sequence of tensors to the default group from rank 0.
    Small tensors are first coalesced into a buffer to reduce the number
    of broadcasts.

    tensors (sequence): tensors to broadcast. Each tensor needs to be on the same GPU.
    buffer_size (int): maximum size of the buffer for coalescing
    """
    for tensors in _take_tensors(tensors, buffer_size):
        flat_tensors = _flatten_dense_tensors(tensors)
        dist.broadcast(flat_tensors, 0)
        for tensor, synced in zip(tensors, _unflatten_dense_tensors(flat_tensors, tensors)):
            tensor.copy_(synced)
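# Hedged sketch of the coalescing idea used above: pack many small tensors into one flat
# buffer so a single dist.broadcast replaces many tiny collective calls. The private
# torch._utils helpers mirror the ones referenced above; treat this as an illustration,
# not an API guarantee across PyTorch versions.
import torch
import torch.distributed as dist
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

def broadcast_coalesced_once(tensors, src=0):
    flat = _flatten_dense_tensors(tensors)   # one contiguous buffer
    dist.broadcast(flat, src)                # single collective call
    for t, synced in zip(tensors, _unflatten_dense_tensors(flat, tensors)):
        t.copy_(synced)                      # write the received values back in place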
def _test_barrier_helper(self, group, group_id, rank):
    WAIT_TIME = 0.3  # seconds

    for dest in group:
        expected_time = torch.DoubleTensor(1).fill_(0.0)
        if dest == rank:
            expected_time.fill_(time.time() + WAIT_TIME)
            dist.broadcast(expected_time, dest, group_id)
            time.sleep(WAIT_TIME + 0.1)  # sleep a little bit longer
            dist.barrier(group_id)
        else:
            dist.broadcast(expected_time, dest, group_id)
            dist.barrier(group_id)
            self.assertGreaterEqual(time.time(), expected_time[0])

    self._barrier()
def __init__(self, module):
    super(DistributedDataParallel, self).__init__()
    self.warn_on_half = True if dist._backend == dist.dist_backend.GLOO else False
    self.module = module

    for p in self.module.state_dict().values():
        if not torch.is_tensor(p):
            continue
        if dist._backend == dist.dist_backend.NCCL:
            assert p.is_cuda, "NCCL backend only supports model parameters to be on GPU."
        dist.broadcast(p, 0)

    def allreduce_params():
        if self.needs_reduction:
            self.needs_reduction = False
            buckets = {}
            for param in self.module.parameters():
                if param.requires_grad and param.grad is not None:
                    tp = type(param.data)
                    if tp not in buckets:
                        buckets[tp] = []
                    buckets[tp].append(param)
            if self.warn_on_half:
                if torch.cuda.HalfTensor in buckets:
                    print("WARNING: gloo dist backend for half parameters may be extremely slow." +
                          " It is recommended to use the NCCL backend in this case.")
                    self.warn_on_half = False
            for tp in buckets:
                bucket = buckets[tp]
                grads = [param.grad.data for param in bucket]
                coalesced = _flatten_dense_tensors(grads)
                dist.all_reduce(coalesced)
                coalesced /= dist.get_world_size()
                for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
                    buf.copy_(synced)

    for param in list(self.module.parameters()):
        def allreduce_hook(*unused):
            param._execution_engine.queue_callback(allreduce_params)
        if param.requires_grad:
            param.register_hook(allreduce_hook)
def _sync_params(self):
    params = [p.data for p in self.module.parameters()]
    result = broadcast_coalesced(params, self.device_ids, self.broadcast_bucket_size)
    for tensors, module in zip(result[1:], self._module_copies[1:]):
        for tensor, param in zip(tensors, module.parameters()):
            param.data.set_(tensor)

    # cross-node buffer sync
    buffers = list(self.module._all_buffers())
    flat_buffers = _flatten_tensors(buffers)
    dist.broadcast(flat_buffers, 0)
    for buf, synced in zip(buffers, _unflatten_tensors(flat_buffers, buffers)):
        buf.copy_(synced)

    # intra-node buffer sync
    result = broadcast_coalesced(buffers, self.device_ids, self.broadcast_bucket_size)
    for tensors, module in zip(result[1:], self._module_copies[1:]):
        for tensor, buf in zip(tensors, module._all_buffers()):
            buf.set_(tensor)
def broadcast_params(model):
    """ broadcast model parameters """
    for p in model.state_dict().values():
        dist.broadcast(p, 0)
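# Hedged variation on broadcast_params above: a state_dict can in principle contain
# non-tensor entries, so a defensive version skips anything that is not a tensor. Every
# rank must traverse the dict in the same order for the broadcasts to pair up correctly.
import torch
import torch.distributed as dist

def broadcast_params_safe(model, src=0):
    for p in model.state_dict().values():
        if torch.is_tensor(p):
            dist.broadcast(p, src)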
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        # builtins.print = print_pass

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    print("=> creating model '{}'".format(args.arch))
    model = pcl.builder.MoCo(
        models.__dict__[args.arch],
        args.low_dim, args.pcl_r, args.moco_m, args.temperature, args.mlp)
    print(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    train_dataset, eval_dataset = create_cifar10_dataset(args)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        eval_sampler = torch.utils.data.distributed.DistributedSampler(eval_dataset, shuffle=False)
    else:
        train_sampler = None
        eval_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler, drop_last=True)

    # dataloader for center-cropped images, use larger batch size to increase speed
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset, batch_size=args.batch_size * 5, shuffle=False,
        sampler=eval_sampler, num_workers=args.workers, pin_memory=True)

    for epoch in range(args.start_epoch, args.epochs):

        cluster_result = None
        if epoch >= args.warmup_epoch:
            # compute momentum features for center-cropped images
            features = compute_features(eval_loader, model, args)

            # placeholder for clustering result
            cluster_result = {'im2cluster': [], 'centroids': [], 'density': []}
            for num_cluster in args.num_cluster:
                cluster_result['im2cluster'].append(torch.zeros(len(eval_dataset), dtype=torch.long).cuda())
                cluster_result['centroids'].append(torch.zeros(int(num_cluster), args.low_dim).cuda())
                cluster_result['density'].append(torch.zeros(int(num_cluster)).cuda())

            if args.gpu == 0:
                features[torch.norm(features, dim=1) > 1.5] /= 2  # account for the few samples that are computed twice
                features = features.numpy()
                cluster_result = run_kmeans(features, args)  # run kmeans clustering on master node
                # save the clustering result
                # torch.save(cluster_result, os.path.join(args.exp_dir, 'clusters_%d' % epoch))

            dist.barrier()
            # broadcast clustering result
            for k, data_list in cluster_result.items():
                for data_tensor in data_list:
                    dist.broadcast(data_tensor, 0, async_op=False)

        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args, cluster_result)

        if (epoch + 1) % 5 == 0 and (not args.multiprocessing_distributed or
                                     (args.multiprocessing_distributed and args.rank % ngpus_per_node == 0)):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, is_best=False, filename='{}/checkpoint_{:04d}.pth.tar'.format(args.exp_dir, epoch))
MAX_NUM_TENSORS = args.max_num_tensors + 1
MAX_BYTES = args.max_bytes + 1

dist.init_process_group(backend=os.environ['BACKEND'])
rank = dist.get_rank()
dist.barrier()

if rank == 0:
    print_header("broadcast")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.broadcast(tensor, 0)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
        print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            for i in range(0, num_tensors):
                dist.broadcast(tensor, 0)

dist.barrier()

if rank == 0:
    print_header("send from 0 to 1")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
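# Note on the benchmark loop above: the timer brackets only the host-side calls. If the
# tensor lived on the GPU with the NCCL backend, the broadcast work is enqueued on a CUDA
# stream and may still be running when timer() is read, so a hedged variant synchronizes
# the device before stopping the clock.
import torch
import torch.distributed as dist
from timeit import default_timer as timer

def time_broadcasts(tensor, num_tensors, src=0):
    start = timer()
    for _ in range(num_tensors):
        dist.broadcast(tensor, src)
    if tensor.is_cuda:
        torch.cuda.synchronize()  # wait for the enqueued collective work before reading the clock
    return timer() - start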
def train(hyp, opt, device, tb_writer=None): logger.info(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path( opt.logdir) / 'evolve' # logging directory wdir = log_dir / 'weights' # weights directory os.makedirs(wdir, exist_ok=True) last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data, 'r+', encoding="utf-8") as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round( hyp['anchors']) # force autoanchor model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [ ] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [ '', ] # parameter names to freeze (full or partial) if any(freeze): for k, v in model.named_parameters(): if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): v.requires_grad = True if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[ 'lrf']) + hyp['lrf'] # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, 
scheduler, epochs) # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % ( weights, epochs) shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}' ) # save previous weights if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader(test_path, imgsz_test, total_batch_size * 2, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers)[0] # testloader if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # raise 'dd' # Model parameters hyp['cls'] *= nc / 80. 
# scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info('Image sizes %g train, %g test\n' 'Using %g dataloader workers\nLogging results to %s\n' 'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs)) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(int(imgsz * 0.7), int(imgsz * 1.3) + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device), model) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode 
# Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(log_dir / f'train_batch{ni}.jpg') # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) # if tb_writer and result is not None: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr( model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=total_batch_size * 2, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir, plots=epoch == 0 or final_epoch) # plot first and last # Write with open(results_file, 'a') as f: f.write( s + '%10.4g' * 7 % results + '\n') # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = [ 'train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2' ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict() } # Save last, best and delete torch.save(ckpt, wdir / 'last{}.pt'.format(epoch)) if best_fitness == fi: torch.save(ckpt, best) del ckpt from utils.general import plot_results plot_results(save_dir=log_dir) # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = opt.name if opt.name.isnumeric() else '' fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt' for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename if str(f2).endswith('.pt'): # is *.pt strip_optimizer(f2) # strip optimizer os.system( 'gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png 
logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
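# Isolated, hedged sketch of the image-weights synchronization used inside the train()
# function above: rank 0 draws the weighted random indices, every other rank allocates a
# tensor of the same shape, and dist.broadcast makes the epoch's sampling order identical
# across DDP workers. Function name and arguments are illustrative only.
import torch
import torch.distributed as dist

def sync_image_weight_indices(dataset_indices, n, rank):
    indices = (torch.tensor(dataset_indices) if rank == 0 else torch.zeros(n)).int()
    dist.broadcast(indices, 0)  # gloo broadcasts CPU tensors; NCCL would need CUDA tensors
    return dataset_indices if rank == 0 else indices.numpy()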
def cluster_memory(self):
    self.start_idx = 0
    j = 0
    with torch.no_grad():
        for i_K, K in enumerate(self.num_clusters):
            # run distributed k-means

            # init centroids with elements from memory bank of rank 0
            centroids = torch.empty(K, self.embedding_dim).cuda(non_blocking=True)
            if get_rank() == 0:
                random_idx = torch.randperm(len(self.local_memory_embeddings[j]))[:K]
                assert len(random_idx) >= K, "please reduce the number of centroids"
                centroids = self.local_memory_embeddings[j][random_idx]
            dist.broadcast(centroids, 0)

            for n_iter in range(self.nmb_kmeans_iters + 1):

                # E step
                dot_products = torch.mm(self.local_memory_embeddings[j], centroids.t())
                _, assignments = dot_products.max(dim=1)

                # finish
                if n_iter == self.nmb_kmeans_iters:
                    break

                # M step
                where_helper = get_indices_sparse(assignments.cpu().numpy())
                counts = torch.zeros(K).cuda(non_blocking=True).int()
                emb_sums = torch.zeros(K, self.embedding_dim).cuda(non_blocking=True)
                for k in range(len(where_helper)):
                    if len(where_helper[k][0]) > 0:
                        emb_sums[k] = torch.sum(
                            self.local_memory_embeddings[j][where_helper[k][0]],
                            dim=0,
                        )
                        counts[k] = len(where_helper[k][0])
                all_reduce_sum(counts)
                mask = counts > 0
                all_reduce_sum(emb_sums)
                centroids[mask] = emb_sums[mask] / counts[mask].unsqueeze(1)

                # normalize centroids
                centroids = nn.functional.normalize(centroids, dim=1, p=2)

            getattr(self, "centroids" + str(i_K)).copy_(centroids)

            # gather the assignments
            assignments_all = gather_from_all(assignments)
            indexes_all = gather_from_all(self.local_memory_index)
            self.assignments[i_K] = -100
            self.assignments[i_K][indexes_all] = assignments_all

            j = (j + 1) % self.nmb_mbs

    logging.info(f"Rank: {get_rank()}, clustering of the memory bank done")
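# Distilled, hedged sketch of the distributed k-means pattern in cluster_memory() above:
# rank 0 samples initial centroids, dist.broadcast shares them, and per-cluster sums and
# counts are combined with all_reduce so every rank computes identical updated centroids.
# All names here are illustrative, not part of the original class.
import torch
import torch.distributed as dist

def kmeans_step(local_emb, centroids):
    assignments = (local_emb @ centroids.t()).max(dim=1)[1]     # E step: nearest centroid by dot product
    K, D = centroids.shape
    sums = torch.zeros(K, D, device=local_emb.device)
    counts = torch.zeros(K, device=local_emb.device)
    sums.index_add_(0, assignments, local_emb)                  # per-cluster embedding sums
    counts.index_add_(0, assignments, torch.ones_like(assignments, dtype=counts.dtype))
    dist.all_reduce(sums)
    dist.all_reduce(counts)
    mask = counts > 0
    centroids[mask] = sums[mask] / counts[mask].unsqueeze(1)    # M step on non-empty clusters
    return torch.nn.functional.normalize(centroids, dim=1, p=2), assignments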
def train(hyp, tb_writer, opt, device): print(f'Hyperparameters {hyp}') log_dir = tb_writer.log_dir if tb_writer else 'runs/evolution' # run directory wdir = str(Path(log_dir) / 'weights') + os.sep # weights directory os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = log_dir + os.sep + 'results.txt' epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.local_rank # TODO: Init DDP logging. Only the first process is allowed to log. # Since I see lots of print here, the logging configuration is skipped here. We may see repeated outputs. # Save run settings with open(Path(log_dir) / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(Path(log_dir) / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Remove previous results if rank in [-1, 0]: for f in glob.glob('*_batch*.jpg') + glob.glob(results_file): os.remove(f) # Create model model = Model(opt.cfg, nc=nc).to(device) # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # Optimizer nbs = 64 # nominal batch size # default DDP implementation is slow for accumulation according to: https://pytorch.org/docs/stable/notes/ddp.html # all-reduce operation is carried out during loss.backward(). # Thus, there would be redundant all-reduce communications in a accumulation procedure, # which means, the result is still right but the training speed gets slower. 
# TODO: If acceleration is needed, there is an implementation of allreduce_post_accumulation # in https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/BERT/run_pretraining.py accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_parameters(): if v.requires_grad: if '.bias' in k: pg2.append(v) # biases elif '.weight' in k and '.bn' not in k: pg1.append(v) # apply weight decay else: pg0.append(v) # all else if hyp['optimizer'] == 'adam': # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({ 'params': pg1, 'weight_decay': hyp['weight_decay'] }) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Load Model with torch_distributed_zero_first(rank): google_utils.attempt_download(weights) start_epoch, best_fitness = 0, 0.0 if weights.endswith('.pt'): # pytorch format ckpt = torch.load(weights, map_location=device) # load checkpoint # load model try: exclude = ['anchor'] # exclude keys ckpt['model'] = { k: v for k, v in ckpt['model'].float().state_dict().items() if k in model.state_dict() and not any(x in k for x in exclude) and model.state_dict()[k].shape == v.shape } model.load_state_dict(ckpt['model'], strict=False) print('Transferred %g/%g items from %s' % (len(ckpt['model']), len(model.state_dict()), weights)) except KeyError as e: s = "%s is not compatible with %s. This may be due to model differences or %s may be out of date. " \ "Please delete or update %s and try again, or use --weights '' to train from scratch." \ % (weights, opt.cfg, weights, weights) raise KeyError(s) from e # load optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # load results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # epochs start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: print( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt # Mixed precision training https://github.com/NVIDIA/apex if mixed_precision: model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0) # Scheduler https://arxiv.org/pdf/1812.01187.pdf lf = lambda x: (( (1 + math.cos(x * math.pi / epochs)) / 2)**1.0) * 0.8 + 0.2 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # https://discuss.pytorch.org/t/a-problem-occured-when-resuming-an-optimizer/28822 # plot_lr_scheduler(optimizer, scheduler, epochs) # Logging if rank in [-1, 0] and wandb and wandb.run is None: opt.hyp = hyp # add hyperparameters wandb_run = wandb.init( config=opt, resume="allow", project=opt.wandb, name=Path(log_dir).stem, id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) loggers = {'wandb': wandb} # loggers dict # DP mode if device.type != 'cpu' and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and device.type != 'cpu' and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) print('Using SyncBatchNorm()') # Exponential moving average ema = torch_utils.ModelEMA(model) if rank in [-1, 0] else None # DDP mode if device.type != 'cpu' and rank != -1: model = DDP(model, device_ids=[rank], output_device=rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, local_rank=rank, world_size=opt.world_size) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Testloader if rank in [-1, 0]: # local_rank is set to -1. Because only the first process is expected to do evaluation. testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0] # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Class frequency if rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: tb_writer.add_histogram('classes', c, 0) # Check anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) maps = np.zeros(nc) # mAP per class results = ( 0, 0, 0, 0, 0, 0, 0 ) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move if rank in [0, -1]: print('Image sizes %g train, %g test' % (imgsz, imgsz_test)) print('Using %g dataloader workers' % dataloader.num_workers) print('Starting training for %g epochs...' 
% epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) # When in DDP mode, the generated indices will be broadcasted to synchronize dataset. if dataset.image_weights: # Generate indices. if rank in [-1, 0]: w = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices( range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx # Broadcast. if rank != -1: indices = torch.zeros([dataset.n], dtype=torch.int) if rank == 0: indices[:] = torch.from_tensor(dataset.indices, dtype=torch.int) dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) if rank in [-1, 0]: print( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0 - 255 to 0.0 - 1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp( ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward pred = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets.to(device), model) # scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode if not torch.isfinite(loss): print('WARNING: non-finite loss, ending training ', loss_items) return results # Backward if mixed_precision: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() # Optimize if ni % accumulate == 0: optimizer.step() optimizer.zero_grad() if ema is not None: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(Path(log_dir) / ('train_batch%g.jpg' % ni)) # filename Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() # result = plot_images(images=imgs, targets=targets, 
paths=paths, fname=f) # if tb_writer and result is not None: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard elif ni == 10 and wandb: wandb.log( { "Mosaics": [ wandb.Image(str(x), caption=x.name) for x in Path(log_dir).glob('train*.jpg') if x.exists() ] }, commit=False) # end batch ------------------------------------------------------------------------------------------------ # Scheduler scheduler.step() # Only the first process in DDP mode is allowed to log or save checkpoints. if rank in [-1, 0]: # mAP if ema is not None: ema.update_attr( model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=total_batch_size, imgsz=imgsz_test, save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'), model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader, plots=final_epoch, log_imgs=16 if wandb else 0, save_dir=log_dir) # Write with open(results_file, 'a') as f: f.write( s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss' ] for x, tag in zip(list(mloss[:-1]) + list(results), tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) if wandb: wandb.log({tag: x}, step=epoch, commit=tag == tags[-1]) # W&B # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module if hasattr(ema, 'module') else ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict(), 'wandb_id': wandb_run.id if wandb else None } # Save last, best and delete torch.save(ckpt, last) if (best_fitness == fi) and not final_epoch: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % ( f2, opt.bucket)) if opt.bucket and ispt else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png if wandb: files = [ 'results.png', 'confusion_matrix.png', *[f'{x}_curve.png' for x in ('F1', 'PR', 'P', 'R')] ] wandb.log({ "Results": [ wandb.Image(str(Path(log_dir) / f), caption=f) for f in files if (Path(log_dir) / f).exists() ] }) try: print("last:", last) wandb.log_artifact(artifact_or_path=str(last), type='model', name="last") 
except ValueError: print("last model not found in", last) try: print("flast:", flast) wandb.log_artifact(artifact_or_path=str(flast), type='model', name="flast") except ValueError: print("flast model not found in", flast) try: print("best:", best) wandb.log_artifact(artifact_or_path=str(best), type='model', name="best") except ValueError: print("best model not found in", best) try: print("fbest:", fbest) wandb.log_artifact(artifact_or_path=str(fbest), type='model', name="fbest") except ValueError: print("fbest model not found in", fbest) print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None wandb.run.finish() if wandb and wandb.run else None torch.cuda.empty_cache() return results
def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) save_dir, epochs, batch_size, total_batch_size, weights, rank = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Directories wdir = save_dir / 'weights' wdir.mkdir(parents=True, exist_ok=True) # make dir last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = save_dir / 'results.txt' # Save run settings with open(save_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure plots = not opt.evolve # create plots cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.SafeLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc = 1 if opt.single_cls else int(data_dict['nc']) # number of classes names = ['item'] if opt.single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round(hyp['anchors']) # force autoanchor model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = [] state_dict = ckpt['model'] # FP32 #state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=True) # load logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay logger.info(f"Scaled weight_decay = {hyp['weight_decay']}") pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # 
https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Logging if rank in [-1, 0] and wandb and wandb.run is None: opt.hyp = hyp # add hyperparameters wandb_run = wandb.init(config=opt, resume="allow", project='YOLOv5' if opt.project == 'runs/train' else Path(opt.project).stem, name=save_dir.stem, id=ckpt.get('wandb_id') if 'ckpt' in locals() else None) loggers = {'wandb': wandb} # loggers dict # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: ## Optimizer #if ckpt['optimizer'] is not None: # optimizer.load_state_dict(ckpt['optimizer']) # best_fitness = ckpt['best_fitness'] ## Results #if ckpt.get('training_results') is not None: # with open(results_file, 'w') as file: # file.write(ckpt['training_results']) # write results.txt ## Epochs #start_epoch = ckpt['epoch'] + 1 start_epoch = 0 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (weights, epochs) if epochs < start_epoch: logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(model.stride.max()) # grid size (max stride) nl = model.model[-1].nl # number of detection layers (used for scaling hyp['obj']) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # EMA ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: ')) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, # testloader hyp=hyp, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers, pad=0.5, prefix=colorstr('val: '))[0] if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, save_dir, loggers) if tb_writer: tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters hyp['box'] *= 3. / nl # scale to layers hyp['cls'] *= nc / 80. * 3. / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640) ** 2 * 3. 
/ nl # scale to image size and layers model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) compute_loss = ComputeLoss(model) # init loss class logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n' f'Using {dataloader.num_workers} dataloader workers\n' f'Logging results to {save_dir}\n' f'Starting training for {epochs} epochs...') for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ opt_s = opt.s if opt.sr_cos: mask_period = 2 #opt_s = opt.s * lf(epoch) opt_s = ((((1 + math.cos(epoch * math.pi / epochs)) / 2) ** 1.0) * 0.8 + 0.2) * opt.s if opt.sr and epoch % mask_period == 0 and epoch > 0: maskBN(model, soft=True) model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) if opt.sr_cos: opt_s = np.interp(ni, xi, [0.0, opt.s * lf(epoch)]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to 
gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss(pred, targets.to(device)) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: if opt.sr: updateBN(opt_s, model) scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ( '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if plots and ni < 3: f = save_dir / f'train_batch{ni}.jpg' # filename Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() # if tb_writer: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard elif plots and ni == 10 and wandb: wandb.log({"Mosaics": [wandb.Image(str(x), caption=x.name) for x in save_dir.glob('train*.jpg') if x.exists()]}) # end batch ------------------------------------------------------------------------------------------------ # end epoch ---------------------------------------------------------------------------------------------------- # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, verbose=nc < 50 and final_epoch, plots=plots and final_epoch, log_imgs=opt.log_imgs if wandb else 0, compute_loss=compute_loss) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Log tags = ['train/box_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/box_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2'] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) # tensorboard if wandb: wandb.log({tag: x}) # W&B # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = {'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict(), 'wandb_id': wandb_run.id if wandb else None} storage_period = 10 if epoch % storage_period == 0: torch.save(ckpt, 
os.path.splitext(last)[0]+'_%s'%epoch+os.path.splitext(last)[1]) # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers final = best if best.exists() else last # final model for f in [last, best]: if f.exists(): strip_optimizer(f) # strip optimizers if opt.bucket: os.system(f'gsutil cp {final} gs://{opt.bucket}/weights') # upload # Plots if plots: plot_results(save_dir=save_dir) # save as results.png if wandb: files = ['results.png', 'precision_recall_curve.png', 'confusion_matrix.png'] wandb.log({"Results": [wandb.Image(str(save_dir / f), caption=f) for f in files if (save_dir / f).exists()]}) if opt.log_artifacts: wandb.log_artifact(artifact_or_path=str(final), type='model', name=save_dir.stem) # Test best.pt logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) if opt.data.endswith('coco.yaml') and nc == 80: # if COCO for conf, iou, save_json in ([0.25, 0.45, False], [0.001, 0.65, True]): # speed, mAP tests results, _, _ = test.test(opt.data, batch_size=total_batch_size, imgsz=imgsz_test, conf_thres=conf, iou_thres=iou, model=attempt_load(final, device).half(), single_cls=opt.single_cls, dataloader=testloader, save_dir=save_dir, save_json=save_json, plots=False) else: dist.destroy_process_group() wandb.run.finish() if wandb and wandb.run else None torch.cuda.empty_cache() return results
def prune_and_eval(rank, size, orig_fit, acc_constraint, valid, corpus, es, ref_model, num_runs, final_results): _valid = valid gpu_id = GPU_ID total_iterations = es.Tmax / es.popsize individual_iter_count = 0 #ref_model = masked_models[rank] X = torch.Tensor(copy.deepcopy(es.pop)) communicate_size = es.n + 4 # the size of tensors transfer accross computers communicate_tensor = torch.FloatTensor(communicate_size * [0.]) fitness_list = [] itr_best_remain = 0 if rank == 0: # rank 0 is the main process to collect finesses X.share_memory_() #fitness_list = [torch.FloatTensor([0.0,0.1,0.2,0.3]).share_memory_() for i in range(size)] fitness_list = [ torch.FloatTensor(communicate_size * [0.]).share_memory_() for i in range(size) ] if rank >= 1 and rank < size: # split tasks to different GPUs gpu_id = other_GPU_IDs[rank - 1] with cuda.device(gpu_id): while (individual_iter_count < total_iterations): if rank == 0: # master node itr_X = torch.Tensor(es.ask()) # broadcast the fathers X.copy_(itr_X) dist.broadcast(itr_X, 0) else: # recieve fathers from the source process dist.broadcast(X, 0) # apply MP on model x = X.numpy()[rank] ref_model.change_mask(x, apply_MP_on_mask) ref_model.apply_mask() # evaluate pruned network fitness = evaluate_lm(ref_model.masked_model, _valid, corpus, TEST_BATCH_SIZE) communicate_tensor[0] = fitness[0] communicate_tensor[1] = fitness[1] communicate_tensor[2] = rank communicate_tensor[3] = ref_model.get_sparsity() for i in range(x.size): communicate_tensor[i + 4] = X[rank, i] #x[i] # sync fitness if rank == 0: # collect fitness across processes dist.gather(communicate_tensor, gather_list=fitness_list) else: dist.gather(communicate_tensor, dst=0) # judge new solutions if rank == 0: # negatively correlated search in master node fit = [] X_ = [] for i in range(es.popsize): the_fitness = 100 for j in range(len( fitness_list)): # results of fitness evaluation if int(fitness_list[j] [2]) == i: # 0:ppl, 1:acc, 2:rank of individual X_.append(fitness_list[j].numpy()[4:]) if orig_fit[1] - fitness_list[j][ 1] <= acc_constraint: the_fitness = -fitness_list[j][3] else: the_fitness = (orig_fit[1] - fitness_list[j][1] ) / acc_constraint continue fit.append(the_fitness) es.tell(X_, fit) itr_best_remain = min(fit) final_results['result_NCS'].copy_(torch.Tensor(es.result()[0])) individual_iter_count += 1 if rank == 0: # record status logger.scalar_summary( 'ncs_%s_fitness' % num_runs, es.result()[1], num_runs * total_iterations + individual_iter_count) logger.scalar_summary( 'ncs_%s_best_itr_remain' % num_runs, itr_best_remain, num_runs * total_iterations + individual_iter_count) logger.histo_summary( 'ncs_%s_pop' % num_runs, es.result()[0], num_runs * total_iterations + individual_iter_count) logger.histo_summary( 'pop of 1', X_[0], num_runs * total_iterations + individual_iter_count) logger.scalar_summary( 'sp of 1', -fitness_list[0][3], num_runs * total_iterations + individual_iter_count) logger.scalar_summary( 'rank of 1', fitness_list[0][2], num_runs * total_iterations + individual_iter_count) logger.histo_summary( 'pop of 2', X_[1], num_runs * total_iterations + individual_iter_count) logger.scalar_summary( 'sp of 2', -fitness_list[1][3], num_runs * total_iterations + individual_iter_count) logger.scalar_summary( 'rank of 2', fitness_list[1][2], num_runs * total_iterations + individual_iter_count) #logger.histo_summary('pop of 3', X_[2], num_runs*total_iterations + individual_iter_count) #logger.scalar_summary('sp of 3', -fitness_list[2][3], num_runs*total_iterations + 
individual_iter_count) #logger.scalar_summary('rank of 3', fitness_list[2][2], num_runs*total_iterations + individual_iter_count) ref_model.clear_cache()
def broadcast_params(model):
    for p in model.state_dict().values():
        dist.broadcast(p, 0)
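# A minimal, self-contained sketch (not from the snippet above) of how a helper
# like broadcast_params() is typically driven: initialize the process group,
# build the model on every rank, then pull rank 0's weights everywhere.
# The gloo backend, the address/port and the nn.Linear stand-in model are
# assumptions for illustration only.
import os
import torch
import torch.nn as nn
import torch.distributed as dist

def _demo_broadcast_params(rank, world_size):
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    dist.init_process_group('gloo', rank=rank, world_size=world_size)
    model = nn.Linear(4, 2)      # stand-in for the real model
    broadcast_params(model)      # after this, all ranks hold rank 0's weights
    dist.destroy_process_group()

if __name__ == '__main__':
    torch.multiprocessing.spawn(_demo_broadcast_params, args=(2,), nprocs=2)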
print("Initialised process group")
for at in range(20):
    if rank == 0:
        if args.test_correctness:
            torch.randn([sz, ], out=tensor)
            rand = np.random.randint(5)
            ind = np.random.randint(100)
            tensor[ind] = rand
        st = time.time()
        dist.broadcast(tensor=tensor, src=0)
        if args.test_correctness:
            tensor.zero_()
        wait_st = time.time()
        dist.broadcast(tensor=tensor, src=1)
        wait_en = time.time()
        print("Time spent in receive call = ", wait_en - wait_st)
        if args.test_correctness:
            assert int(tensor[ind].item()) == rand
            print(f"Attempt {at}: Data was successfully received...")
        en = time.time()
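# Side note, not from the snippet above: dist.broadcast on CUDA tensors can
# return before the NCCL kernel has finished, so wall-clock timings taken
# around the call may under-report. A hedged sketch of a more robust
# measurement fences with a barrier and synchronizes the device first.
import time
import torch
import torch.distributed as dist

def timed_broadcast(tensor, src=0, iters=20):
    dist.barrier()                    # start all ranks together
    if tensor.is_cuda:
        torch.cuda.synchronize()
    start = time.time()
    for _ in range(iters):
        dist.broadcast(tensor, src=src)
    if tensor.is_cuda:
        torch.cuda.synchronize()      # wait for any asynchronous NCCL work
    return (time.time() - start) / iters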
def main(local_rank, world_size, init_method='tcp://127.0.0.1:23499'): dist.init_process_group(backend='nccl', init_method=init_method, rank=local_rank, world_size=world_size) cfg.local_rank = local_rank torch.cuda.set_device(local_rank) cfg.rank = dist.get_rank() cfg.world_size = world_size print(cfg.rank, dist.get_world_size()) trainset = MXFaceDataset(root_dir='/root/face_datasets/webface/', local_rank=local_rank) train_sampler = torch.utils.data.distributed.DistributedSampler( trainset, shuffle=True) trainloader = DataLoaderX(local_rank=local_rank, dataset=trainset, batch_size=cfg.batch_size, sampler=train_sampler, num_workers=0, pin_memory=True, drop_last=False) backbone = iresnet50(False).to(cfg.local_rank) backbone.train() # backbone = nn.SyncBatchNorm.convert_sync_batchnorm(backbone) for ps in backbone.parameters(): dist.broadcast(ps, 0) backbone = torch.nn.parallel.DistributedDataParallel( backbone, broadcast_buffers=False, device_ids=[dist.get_rank()]) backbone.train() sub_start, sub_classnum = get_sub_class(cfg.rank, dist.get_world_size()) print(sub_start, sub_classnum) classifier_head = classifier(cfg.embedding_size, sub_classnum, sample_rate=0.4) cosface = CosFace(s=64.0, m=0.4) optimizer = SGD([{ 'params': backbone.parameters() }, { 'params': classifier_head.parameters() }], 0.1, momentum=0.9, weight_decay=cfg.weight_decay, rescale=cfg.world_size) warm_up_with_multistep_lr = lambda epoch: ( (epoch + 1) / (4 + 1))**2 if epoch < -1 else 0.1**len( [m for m in [20, 29] if m - 1 <= epoch]) scheduler = torch.optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=warm_up_with_multistep_lr) n_epochs = 33 start_epoch = 0 if cfg.local_rank == 0: writer = SummaryWriter(log_dir='logs/shows') global_step = 0 loss_fun = nn.CrossEntropyLoss() for epoch in range(start_epoch, n_epochs): train_sampler.set_epoch(epoch) for step, (img, label) in enumerate(trainloader): start = time.time() lable_gather, norm_weight = classifier_head.prepare( label, optimizer) x = F.normalize(backbone(img)) x_gather = torch.zeros(x.size()[0] * cfg.world_size, cfg.embedding_size, device=cfg.local_rank) dist.all_gather(list(x_gather.chunk(cfg.world_size, dim=0)), x.data) x_gather.requires_grad = True logits = classifier_head(x_gather, norm_weight) logits = cosface(logits, lable_gather) with torch.no_grad(): max_v = torch.max(logits, dim=1, keepdim=True)[0] dist.all_reduce(max_v, dist.ReduceOp.MAX) exp = torch.exp(logits - max_v) sum_exp = exp.sum(dim=1, keepdims=True) dist.all_reduce(sum_exp, dist.ReduceOp.SUM) exp.div_(sum_exp.clamp_min(1e-20)) grad = exp index = torch.where(lable_gather != -1)[0] one_hot = torch.zeros(index.size()[0], grad.size()[1], device=grad.device) one_hot.scatter_(1, lable_gather[index, None], 1) loss = torch.zeros(grad.size()[0], 1, device=grad.device) loss[index] = grad[index].gather(1, lable_gather[index, None]) dist.all_reduce(loss, dist.ReduceOp.SUM) loss_v = loss.clamp_min_(1e-20).log_().mean() * (-1) grad[index] -= one_hot grad.div_(grad.size()[0]) logits.backward(grad) if x_gather.grad is not None: x_gather.grad.detach_() x_grad = torch.zeros_like(x) dist.reduce_scatter( x_grad, list(x_gather.grad.chunk(cfg.world_size, dim=0))) x.backward(x_grad) optimizer.step() classifier_head.update() optimizer.zero_grad() if cfg.rank == 0: print(x_gather.grad.max(), x_gather.grad.min()) print('loss_v', loss_v.item(), global_step) writer.add_scalar('loss', loss_v, global_step) print('lr', optimizer.state_dict()['param_groups'][0]['lr'], global_step) print(cfg.batch_size / (time.time() - start)) 
global_step += 1 scheduler.step() if cfg.rank == 0: torch.save(backbone.module.state_dict(), "models/" + str(epoch) + 'backbone.pth') dist.destroy_process_group()
# Configuration
ranks_per_node = 8
shape = 2**17
dtype = torch.float32

# Initialize MPI
rank, n_ranks = init_workers_nccl_file()
local_rank = rank % ranks_per_node

# Allocate a small tensor on every gpu from every rank.
# This is an attempt to force creation of all device contexts.
# for i in range(ranks_per_node):
#     _ = torch.randn(1).to(torch.device('cuda', i))

# Select our gpu
device = torch.device('cuda', local_rank)
print('Rank', rank, 'size', n_ranks, 'device', device,
      'count', torch.cuda.device_count())

# Allocate a tensor on the gpu
x = torch.randn(shape, dtype=dtype).to(device)
print('local result:', x.sum())

# Do a broadcast from rank 0
dist.broadcast(x, 0)
print('broadcast result:', x.sum())

# Do an all-reduce
dist.all_reduce(x)
print('allreduce result:', x.sum())
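# init_workers_nccl_file() is not defined in the snippet above. A hypothetical
# stand-in, assuming RANK/WORLD_SIZE environment variables and a shared
# filesystem, could initialize the process group from a file:// rendezvous:
import os
import torch.distributed as dist

def init_workers_nccl_file(sync_file='/tmp/pytorch_nccl_sync'):
    rank = int(os.environ['RANK'])
    n_ranks = int(os.environ['WORLD_SIZE'])
    dist.init_process_group('nccl',
                            init_method=f'file://{sync_file}',
                            rank=rank,
                            world_size=n_ranks)
    return rank, n_ranks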
def main(): args.distributed = True print("~~epoch\thours\ttop1Accuracy\n") start_time = datetime.now() if args.distributed: os.environ['WORLD_SIZE'] = str(args.world_size) dist.init_process_group(backend=args.dist_backend, init_method = args.dist_url, world_size = args.world_size, rank = int(os.environ['RANK'])) torch.cuda.set_device(args.local_rank) if dist.get_rank() == 0: print(str(dist.get_world_size()) + ' number of workers is set up!') if dist.get_rank() == 0: torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) log_writer = tensorboardX.SummaryWriter(args.save_dir) if dist.get_rank() == 0 else None # create model model = models.resnet50() model = model.cuda() #model.para sync global param_copy param_copy = list(model.parameters()) for parameter in param_copy: dist.broadcast(parameter.data, 0) #group = 0 if dist.get_rank() == 0: print('parameter sync finished') # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = Signum_SGD.SGD_distribute(param_copy, args, log_writer) best_prec1 = 0 # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) else: print("=> no checkpoint found at '{}'".format(args.resume)) traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') args.sz = 224 train_loader,val_loader,train_sampler = get_loaders(traindir, valdir, split_data = not args.test_evaluate, seed = args.seed) if args.evaluate: return validate(val_loader, model, criterion, epoch, start_time) for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) if args.distributed: train_sampler.set_epoch(epoch) with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UserWarning) train(train_loader, model, criterion, optimizer, epoch, log_writer) if args.prof: break prec1 = validate(val_loader, model, criterion, epoch, start_time, log_writer) if dist.get_rank() == 0: is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) '''
def train_net(args, config): # setup logger logger, final_output_path = create_logger(config.OUTPUT_PATH, args.cfg, config.DATASET.TRAIN_IMAGE_SET, split='train') model_prefix = os.path.join(final_output_path, config.MODEL_PREFIX) if args.log_dir is None: args.log_dir = os.path.join(final_output_path, 'tensorboard_logs') # pprint.pprint(args) # logger.info('training args:{}\n'.format(args)) # pprint.pprint(config) # logger.info('training config:{}\n'.format(pprint.pformat(config))) # manually set random seed if config.RNG_SEED > -1: random.seed(a=config.RNG_SEED) np.random.seed(config.RNG_SEED) torch.random.manual_seed(config.RNG_SEED) torch.cuda.manual_seed_all(config.RNG_SEED) torch.backends.cudnn.deterministic = True imgaug.random.seed(config.RNG_SEED) # cudnn torch.backends.cudnn.benchmark = False if args.cudnn_off: torch.backends.cudnn.enabled = False if args.dist: model = eval(config.MODULE)(config) local_rank = int(os.environ.get('LOCAL_RANK') or 0) config.GPUS = str(local_rank) torch.cuda.set_device(local_rank) master_address = os.environ['MASTER_ADDR'] master_port = int(os.environ['MASTER_PORT'] or 23456) world_size = int(os.environ['WORLD_SIZE'] or 1) rank = int(os.environ['RANK'] or 0) if rank == 0: pprint.pprint(args) logger.info('training args:{}\n'.format(args)) pprint.pprint(config) logger.info('training config:{}\n'.format(pprint.pformat(config))) if args.slurm: distributed.init_process_group(backend='nccl') else: try: distributed.init_process_group( backend='nccl', init_method='tcp://{}:{}'.format(master_address, master_port), world_size=world_size, rank=rank, group_name='mtorch') except RuntimeError: pass print( f'native distributed, size: {world_size}, rank: {rank}, local rank: {local_rank}' ) torch.cuda.set_device(local_rank) config.GPUS = str(local_rank) model = model.cuda() if not config.TRAIN.FP16: model = DDP(model, device_ids=[local_rank], output_device=local_rank, find_unused_parameters=True) if rank == 0: summary_parameters( model.module if isinstance( model, torch.nn.parallel.DistributedDataParallel) else model, logger) shutil.copy(args.cfg, final_output_path) shutil.copy(inspect.getfile(eval(config.MODULE)), final_output_path) writer = None if args.log_dir is not None: tb_log_dir = os.path.join(args.log_dir, 'rank{}'.format(rank)) if not os.path.exists(tb_log_dir): os.makedirs(tb_log_dir) writer = SummaryWriter(log_dir=tb_log_dir) batch_size = world_size * (sum(config.TRAIN.BATCH_IMAGES) if isinstance(config.TRAIN.BATCH_IMAGES, list) else config.TRAIN.BATCH_IMAGES) if config.TRAIN.GRAD_ACCUMULATE_STEPS > 1: batch_size = batch_size * config.TRAIN.GRAD_ACCUMULATE_STEPS base_lr = config.TRAIN.LR * batch_size optimizer_grouped_parameters = [{ 'params': [p for n, p in model.named_parameters() if _k in n], 'lr': base_lr * _lr_mult } for _k, _lr_mult in config.TRAIN.LR_MULT] optimizer_grouped_parameters.append({ 'params': [ p for n, p in model.named_parameters() if all([_k not in n for _k, _ in config.TRAIN.LR_MULT]) ] }) if config.TRAIN.OPTIMIZER == 'SGD': optimizer = optim.SGD(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WD) elif config.TRAIN.OPTIMIZER == 'Adam': optimizer = optim.Adam(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, weight_decay=config.TRAIN.WD) elif config.TRAIN.OPTIMIZER == 'AdamW': optimizer = AdamW(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, betas=(0.9, 0.999), eps=1e-6, weight_decay=config.TRAIN.WD, correct_bias=True) else: raise 
ValueError('Not support optimizer {}!'.format( config.TRAIN.OPTIMIZER)) total_gpus = world_size train_loader, train_sampler = make_dataloader(config, mode='train', distributed=True, num_replicas=world_size, rank=rank, expose_sampler=True) val_loader = make_dataloader(config, mode='val', distributed=True, num_replicas=world_size, rank=rank) else: pprint.pprint(args) logger.info('training args:{}\n'.format(args)) pprint.pprint(config) logger.info('training config:{}\n'.format(pprint.pformat(config))) #os.environ['CUDA_VISIBLE_DEVICES'] = config.GPUS model = eval(config.MODULE)(config) summary_parameters(model, logger) shutil.copy(args.cfg, final_output_path) shutil.copy(inspect.getfile(eval(config.MODULE)), final_output_path) num_gpus = len(config.GPUS.split(',')) # assert num_gpus <= 1 or (not config.TRAIN.FP16), "Not support fp16 with torch.nn.DataParallel. " \ # "Please use amp.parallel.DistributedDataParallel instead." if num_gpus > 1 and config.TRAIN.FP16: logger.warning("Not support fp16 with torch.nn.DataParallel.") config.TRAIN.FP16 = False total_gpus = num_gpus rank = None writer = SummaryWriter( log_dir=args.log_dir) if args.log_dir is not None else None if hasattr(model, 'setup_adapter'): logger.info('Setting up adapter modules!') model.setup_adapter() # model if num_gpus > 1: model = torch.nn.DataParallel( model, device_ids=[int(d) for d in config.GPUS.split(',')]).cuda() else: torch.cuda.set_device(int(config.GPUS)) model.cuda() # loader # train_set = 'train+val' if config.DATASET.TRAIN_WITH_VAL else 'train' train_loader = make_dataloader(config, mode='train', distributed=False) val_loader = make_dataloader(config, mode='val', distributed=False) train_sampler = None batch_size = num_gpus * (sum(config.TRAIN.BATCH_IMAGES) if isinstance( config.TRAIN.BATCH_IMAGES, list) else config.TRAIN.BATCH_IMAGES) if config.TRAIN.GRAD_ACCUMULATE_STEPS > 1: batch_size = batch_size * config.TRAIN.GRAD_ACCUMULATE_STEPS base_lr = config.TRAIN.LR * batch_size optimizer_grouped_parameters = [{ 'params': [p for n, p in model.named_parameters() if _k in n], 'lr': base_lr * _lr_mult } for _k, _lr_mult in config.TRAIN.LR_MULT] optimizer_grouped_parameters.append({ 'params': [ p for n, p in model.named_parameters() if all([_k not in n for _k, _ in config.TRAIN.LR_MULT]) ] }) if config.TRAIN.OPTIMIZER == 'SGD': optimizer = optim.SGD(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WD) elif config.TRAIN.OPTIMIZER == 'Adam': optimizer = optim.Adam(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, weight_decay=config.TRAIN.WD) elif config.TRAIN.OPTIMIZER == 'AdamW': optimizer = AdamW(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, betas=(0.9, 0.999), eps=1e-6, weight_decay=config.TRAIN.WD, correct_bias=True) else: raise ValueError('Not support optimizer {}!'.format( config.TRAIN.OPTIMIZER)) # partial load pretrain state dict if config.NETWORK.PARTIAL_PRETRAIN != "": pretrain_state_dict = torch.load( config.NETWORK.PARTIAL_PRETRAIN, map_location=lambda storage, loc: storage)['state_dict'] prefix_change = [ prefix_change.split('->') for prefix_change in config.NETWORK.PARTIAL_PRETRAIN_PREFIX_CHANGES ] if len(prefix_change) > 0: pretrain_state_dict_parsed = {} for k, v in pretrain_state_dict.items(): no_match = True for pretrain_prefix, new_prefix in prefix_change: if k.startswith(pretrain_prefix): k = new_prefix + k[len(pretrain_prefix):] pretrain_state_dict_parsed[k] = v no_match = False break if no_match: 
pretrain_state_dict_parsed[k] = v pretrain_state_dict = pretrain_state_dict_parsed smart_partial_load_model_state_dict(model, pretrain_state_dict) # pretrained classifier # if config.NETWORK.CLASSIFIER_PRETRAINED: # print('Initializing classifier weight from pretrained word embeddings...') # answers_word_embed = [] # for k, v in model.state_dict().items(): # if 'word_embeddings.weight' in k: # word_embeddings = v.detach().clone() # break # for answer in train_loader.dataset.answer_vocab: # a_tokens = train_loader.dataset.tokenizer.tokenize(answer) # a_ids = train_loader.dataset.tokenizer.convert_tokens_to_ids(a_tokens) # a_word_embed = (torch.stack([word_embeddings[a_id] for a_id in a_ids], dim=0)).mean(dim=0) # answers_word_embed.append(a_word_embed) # answers_word_embed_tensor = torch.stack(answers_word_embed, dim=0) # for name, module in model.named_modules(): # if name.endswith('final_mlp'): # module[-1].weight.data = answers_word_embed_tensor.to(device=module[-1].weight.data.device) # metrics train_metrics_list = [ cls_metrics.Accuracy(allreduce=args.dist, num_replicas=world_size if args.dist else 1) ] val_metrics_list = [ cls_metrics.Accuracy(allreduce=args.dist, num_replicas=world_size if args.dist else 1), cls_metrics.RocAUC(allreduce=args.dist, num_replicas=world_size if args.dist else 1) ] for output_name, display_name in config.TRAIN.LOSS_LOGGERS: train_metrics_list.append( cls_metrics.LossLogger( output_name, display_name=display_name, allreduce=args.dist, num_replicas=world_size if args.dist else 1)) train_metrics = CompositeEvalMetric() val_metrics = CompositeEvalMetric() for child_metric in train_metrics_list: train_metrics.add(child_metric) for child_metric in val_metrics_list: val_metrics.add(child_metric) # epoch end callbacks epoch_end_callbacks = [] if (rank is None) or (rank == 0): epoch_end_callbacks = [ Checkpoint(model_prefix, config.CHECKPOINT_FREQUENT) ] validation_monitor = ValidationMonitor( do_validation, val_loader, val_metrics, host_metric_name='RocAUC', label_index_in_batch=config.DATASET.LABEL_INDEX_IN_BATCH, model_dir=os.path.dirname(model_prefix)) # optimizer initial lr before for group in optimizer.param_groups: group.setdefault('initial_lr', group['lr']) # resume/auto-resume if rank is None or rank == 0: smart_resume(model, optimizer, validation_monitor, config, model_prefix, logger) if args.dist: begin_epoch = torch.tensor(config.TRAIN.BEGIN_EPOCH).cuda() distributed.broadcast(begin_epoch, src=0) config.TRAIN.BEGIN_EPOCH = begin_epoch.item() # batch end callbacks batch_size = len(config.GPUS.split(',')) * config.TRAIN.BATCH_IMAGES batch_end_callbacks = [ Speedometer(batch_size, config.LOG_FREQUENT, batches_per_epoch=len(train_loader), epochs=config.TRAIN.END_EPOCH - config.TRAIN.BEGIN_EPOCH) ] # setup lr step and lr scheduler if config.TRAIN.LR_SCHEDULE == 'plateau': print("Warning: not support resuming on plateau lr schedule!") lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode='max', factor=config.TRAIN.LR_FACTOR, patience=1, verbose=True, threshold=1e-4, threshold_mode='rel', cooldown=2, min_lr=0, eps=1e-8) elif config.TRAIN.LR_SCHEDULE == 'triangle': lr_scheduler = WarmupLinearSchedule( optimizer, config.TRAIN.WARMUP_STEPS if config.TRAIN.WARMUP else 0, t_total=int(config.TRAIN.END_EPOCH * len(train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS), last_epoch=int(config.TRAIN.BEGIN_EPOCH * len(train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS) - 1) elif config.TRAIN.LR_SCHEDULE == 'step': lr_iters = [ int(epoch * 
len(train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS) for epoch in config.TRAIN.LR_STEP ] lr_scheduler = WarmupMultiStepLR( optimizer, milestones=lr_iters, gamma=config.TRAIN.LR_FACTOR, warmup_factor=config.TRAIN.WARMUP_FACTOR, warmup_iters=config.TRAIN.WARMUP_STEPS if config.TRAIN.WARMUP else 0, warmup_method=config.TRAIN.WARMUP_METHOD, last_epoch=int(config.TRAIN.BEGIN_EPOCH * len(train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS) - 1) else: raise ValueError("Not support lr schedule: {}.".format( config.TRAIN.LR_SCHEDULE)) if config.TRAIN.SWA: assert config.TRAIN.SWA_START_EPOCH < config.TRAIN.END_EPOCH if not config.TRAIN.DEBUG: true_epoch_step = len( train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS else: true_epoch_step = 50 step_per_cycle = config.TRAIN.SWA_EPOCH_PER_CYCLE * true_epoch_step # swa_scheduler = torch.optim.lr_scheduler.CyclicLR( # optimizer, # base_lr=config.TRAIN.SWA_MIN_LR * batch_size, # max_lr=config.TRAIN.SWA_MAX_LR * batch_size, # cycle_momentum=False, # step_size_up=10, # step_size_down=step_per_cycle - 10) anneal_steps = max( 1, (config.TRAIN.END_EPOCH - config.TRAIN.SWA_START_EPOCH) // 4) * step_per_cycle anneal_steps = int(anneal_steps) swa_scheduler = SWALR(optimizer, anneal_epochs=anneal_steps, anneal_strategy='linear', swa_lr=config.TRAIN.SWA_MAX_LR * batch_size) else: swa_scheduler = None if config.TRAIN.ROC_STAR: assert config.TRAIN.ROC_START_EPOCH < config.TRAIN.END_EPOCH roc_star = RocStarLoss( delta=2.0, sample_size=config.TRAIN.ROC_SAMPLE_SIZE, sample_size_gamma=config.TRAIN.ROC_SAMPLE_SIZE * 2, update_gamma_each=config.TRAIN.ROC_SAMPLE_SIZE, ) else: roc_star = None # broadcast parameter and optimizer state from rank 0 before training start if args.dist: for v in model.state_dict().values(): distributed.broadcast(v, src=0) # for v in optimizer.state_dict().values(): # distributed.broadcast(v, src=0) best_epoch = torch.tensor(validation_monitor.best_epoch).cuda() best_val = torch.tensor(validation_monitor.best_val).cuda() distributed.broadcast(best_epoch, src=0) distributed.broadcast(best_val, src=0) validation_monitor.best_epoch = best_epoch.item() validation_monitor.best_val = best_val.item() # apex: amp fp16 mixed-precision training if config.TRAIN.FP16: # model.apply(bn_fp16_half_eval) model, optimizer = amp.initialize( model, optimizer, opt_level='O2', keep_batchnorm_fp32=False, loss_scale=config.TRAIN.FP16_LOSS_SCALE, min_loss_scale=32.0) if args.dist: model = Apex_DDP(model, delay_allreduce=True) # NOTE: final_model == model if not using SWA, else final_model == AveragedModel(model) final_model = train( model, optimizer, lr_scheduler, train_loader, train_sampler, train_metrics, config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH, logger, fp16=config.TRAIN.FP16, rank=rank, writer=writer, batch_end_callbacks=batch_end_callbacks, epoch_end_callbacks=epoch_end_callbacks, validation_monitor=validation_monitor, clip_grad_norm=config.TRAIN.CLIP_GRAD_NORM, gradient_accumulate_steps=config.TRAIN.GRAD_ACCUMULATE_STEPS, ckpt_path=config.TRAIN.CKPT_PATH, swa_scheduler=swa_scheduler, swa_start_epoch=config.TRAIN.SWA_START_EPOCH, swa_cycle_epoch=config.TRAIN.SWA_EPOCH_PER_CYCLE, swa_use_scheduler=config.TRAIN.SWA_SCHEDULE, roc_star=roc_star, roc_star_start_epoch=config.TRAIN.ROC_START_EPOCH, roc_interleave=config.TRAIN.ROC_INTERLEAVE, debug=config.TRAIN.DEBUG, ) return rank, final_model
def _do_broadcast(self, tensor: torch.Tensor, src: int) -> torch.Tensor:
    dist.broadcast(tensor, src=src)
    return tensor
def sync_parameters(self):
    for param in self.module.parameters():
        dist.broadcast(param.data, 0)
def weight_broadcast(self):
    for param in self.module.parameters():
        dist.broadcast(param.data, 0)
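# Not part of the helpers above: broadcasting only param.data leaves module
# buffers (e.g. BatchNorm running_mean/running_var) out of sync. A hedged
# variant that also covers buffers could look like this.
def sync_parameters_and_buffers(module, src=0):
    for param in module.parameters():
        dist.broadcast(param.data, src)
    for buf in module.buffers():
        dist.broadcast(buf, src)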
def train(hyp, opt, device, tb_writer=None): print(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path(opt.logdir) / 'evolve' # logging directory wdir = str(log_dir / 'weights') + os.sep # weights directory os.makedirs(wdir, exist_ok=True) last = wdir + 'last.pt' best = wdir + 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # TODO: Use DDP logging. Only the first process is allowed to log. # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # model dict train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int(data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint model = Darknet(opt.cfg).to(device) # create state_dict = {k: v for k, v in ckpt['model'].items() if model.state_dict()[k].numel() == v.numel()} model.load_state_dict(state_dict, strict=False) print('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Darknet(opt.cfg).to(device) # create # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in dict(model.named_parameters()).items(): if '.bias' in k: pg2.append(v) # biases elif 'Conv2d.weight' in k: pg1.append(v) # apply weight_decay else: pg0.append(v) # all else if opt.adam: optimizer = optim.Adam(pg0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay optimizer.add_param_group({'params': pg2}) # add pg2 (biases) print('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: (((1 + math.cos(x * math.pi / epochs)) / 2) ** 1.0) * 0.8 + 0.2 # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if epochs < start_epoch: print('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' 
% (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = 32 # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) print('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=(opt.local_rank)) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, local_rank=rank, world_size=opt.world_size) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, opt.data, nc - 1) # Testloader if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates *** # local_rank is set to -1. Because only the first process is expected to do evaluation. testloader = create_dataloader(test_path, imgsz_test, batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images, rect=True, local_rank=-1, world_size=opt.world_size)[0] # Model parameters hyp['cls'] *= nc / 80. # scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # giou loss ratio (obj_loss = 1.0 or giou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) # attach class weights model.names = names # Class frequency if rank in [-1, 0]: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: tb_writer.add_histogram('classes', c, 0) # Check anchors #if not opt.noautoanchor: # check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Start training t0 = time.time() nw = max(3 * nb, 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification' scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) if rank in [0, -1]: print('Image sizes %g train, %g test' % (imgsz, imgsz_test)) print('Using %g dataloader workers' % dataloader.num_workers) print('Starting training for %g epochs...' 
% epochs) # torch.autograd.set_detect_anomaly(True) for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if dataset.image_weights: # Generate indices if rank in [-1, 0]: w = model.class_weights.cpu().numpy() * (1 - maps) ** 2 # class weights image_weights = labels_to_image_weights(dataset.labels, nc=nc, class_weights=w) dataset.indices = random.choices(range(dataset.n), weights=image_weights, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = torch.zeros([dataset.n], dtype=torch.int) if rank == 0: indices[:] = torch.from_tensor(dataset.indices, dtype=torch.int) dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) if rank in [-1, 0]: print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size')) pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # giou loss ratio (obj_loss = 1.0 or giou) accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [0.9, hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Autocast with amp.autocast(enabled=cuda): # Forward pred = model(imgs) # Loss loss, loss_items = compute_loss(pred, targets.to(device), model) # scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # if not torch.isfinite(loss): # print('WARNING: non-finite loss, ending training ', loss_items) # return results # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema is not None: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ( '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(log_dir / ('train_batch%g.jpg' % ni)) # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) if tb_writer and result is not None: tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to 
tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema is not None: ema.update_attr(model) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test(opt.data, batch_size=batch_size, imgsz=imgsz_test, save_json=final_epoch and opt.data.endswith(os.sep + 'coco.yaml'), model=ema.ema.module if hasattr(ema.ema, 'module') else ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir) # Write with open(results_file, 'a') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, mAP, F1, test_losses=(GIoU, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Tensorboard if tb_writer: tags = ['train/giou_loss', 'train/obj_loss', 'train/cls_loss', 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss'] for x, tag in zip(list(mloss[:-1]) + list(results), tags): tb_writer.add_scalar(tag, x, epoch) # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # fitness_i = weighted combination of [P, R, mAP, F1] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = {'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema.module.state_dict() if hasattr(ema, 'module') else ema.ema.state_dict(), 'optimizer': None if final_epoch else optimizer.state_dict()} # Save last, best and delete torch.save(ckpt, last) if epoch >= (epochs-5): torch.save(ckpt, last.replace('.pt','_{:03d}.pt'.format(epoch))) if (best_fitness == fi) and not final_epoch: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = ('_' if len(opt.name) and not opt.name.isnumeric() else '') + opt.name fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename ispt = f2.endswith('.pt') # is *.pt strip_optimizer(f2) if ispt else None # strip optimizer os.system('gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket and ispt else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
def broadcast(tensor, src):
    return dist.broadcast(tensor, src=src)
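# A related sketch, not from the wrapper above: dist.broadcast only accepts
# tensors. For small picklable Python objects (configs, file lists), newer
# PyTorch releases (assumed >= 1.8) expose dist.broadcast_object_list.
def broadcast_object(obj, src=0):
    holder = [obj]                            # same-length list on every rank
    dist.broadcast_object_list(holder, src=src)
    return holder[0]                          # non-src ranks receive src's object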
def run(rank, size): local_train_length = 3000 local_test_length = 333 train_indices = torch.zeros([3, local_train_length], dtype=torch.long) test_indices = torch.zeros([3, local_test_length], dtype=torch.long) local_data_path = '/home/cream/Desktop/arafin_experiments/SOCC/FL-SNN/data/' save_path = os.getcwd() + r'/results' datasets = {'mnist_dvs_10': r'mnist_dvs_25ms_26pxl_10_digits.hdf5'} dataset = local_data_path + datasets['mnist_dvs_10'] input_train = torch.FloatTensor( tables.open_file(dataset).root.train.data[:]) output_train = torch.FloatTensor( tables.open_file(dataset).root.train.label[:]) input_test = torch.FloatTensor(tables.open_file(dataset).root.test.data[:]) output_test = torch.FloatTensor( tables.open_file(dataset).root.test.label[:]) ### Network parameters n_input_neurons = input_train.shape[1] n_output_neurons = output_train.shape[1] n_hidden_neurons = 4 epochs = local_train_length epochs_test = local_test_length learning_rate = 0.005 / n_hidden_neurons kappa = 0.2 alpha = 1 deltas = 1 num_ite = 1 r = 0.3 weights_magnitude = 0.05 task = 'supervised' mode = 'train', tau_ff = 10 tau_fb = 10 tau = 10 mu = 1.5, n_basis_feedforward = 8 feedforward_filter = filters.raised_cosine_pillow_08 feedback_filter = filters.raised_cosine_pillow_08 n_basis_feedback = 1 topology = torch.ones([ n_hidden_neurons + n_output_neurons, n_input_neurons + n_hidden_neurons + n_output_neurons ], dtype=torch.float) topology[[i for i in range(n_output_neurons + n_hidden_neurons)], [ i + n_input_neurons for i in range(n_output_neurons + n_hidden_neurons) ]] = 0 assert torch.sum(topology[:, :n_input_neurons]) == ( n_input_neurons * (n_hidden_neurons + n_output_neurons)) print(topology[:, n_input_neurons:]) # Create the network network = SNNetwork(**utils.training_utils.make_network_parameters( n_input_neurons, n_output_neurons, n_hidden_neurons, topology_type='fully_connected')) # At the beginning, the master node: # - transmits its weights to the workers # - distributes the samples among workers if rank == 0: # Initializing an aggregation list for future weights collection weights_list = [ [ torch.zeros(network.feedforward_weights.shape, dtype=torch.float) for _ in range(size) ], [ torch.zeros(network.feedback_weights.shape, dtype=torch.float) for _ in range(size) ], [ torch.zeros(network.bias.shape, dtype=torch.float) for _ in range(size) ], [torch.zeros(1, dtype=torch.float) for _ in range(size)] ] else: weights_list = [] if rank == 0: train_indicess = torch.tensor(np.random.choice(np.arange( input_train.shape[0]), [3, local_train_length], replace=False), dtype=torch.long) test_indicess = torch.tensor(np.random.choice(np.arange( input_test.shape[0]), [3, local_test_length], replace=False), dtype=torch.long) dist.send(tensor=train_indicess, dst=1) dist.send(tensor=train_indicess, dst=2) dist.send(tensor=train_indicess, dst=3) else: dist.recv(tensor=train_indices, src=0) dist.barrier() if rank == 0: dist.send(tensor=test_indicess, dst=1) dist.send(tensor=test_indicess, dst=2) dist.send(tensor=test_indicess, dst=3) else: dist.recv(tensor=test_indices, src=0) dist.barrier() if rank != 0: training_data = input_train[train_indices[rank - 1, :]] training_label = output_train[train_indices[rank - 1, :]] test_data = input_test[test_indices[rank - 1, :]] test_label = output_test[test_indices[rank - 1, :]] indices = np.random.choice(np.arange(training_data.shape[0]), [training_data.shape[0]], replace=True) S_prime = training_data.shape[-1] S = epochs * S_prime print("S is", S) dist.barrier() group = 
dist.group.WORLD # Master node sends its weights for parameter in network.get_parameters(): dist.broadcast(network.get_parameters()[parameter], 0) if rank == 0: print( 'Node 0 has shared its model and training data is partitioned among workers' ) # The nodes initialize their eligibility trace and learning signal eligibility_trace = {'ff_weights': 0, 'fb_weights': 0, 'bias': 0} et_temp = {'ff_weights': 0, 'fb_weights': 0, 'bias': 0} learning_signal = 0 ls_temp = 0 dist.barrier() num_ite = 1 test_accs = [] if rank != 0: test_indx = np.random.choice(np.arange(test_data.shape[0]), [test_data.shape[0]], replace=False) np.random.shuffle(test_indx) _, loss = get_acc_and_loss(network, test_data[test_indx], test_label[test_indx]) network.set_mode('train') local_training_sequence = torch.cat((training_data, training_label), dim=1) dist.barrier() ### First local step for i in range(num_ite): for s in range(deltas): if rank != 0: # Feedforward sampling step log_proba, learning_signal, eligibility_trace \ = feedforward_sampling(network, local_training_sequence[indices[0]], eligibility_trace, learning_signal, s, S_prime, alpha, r) if rank != 0: # First local update for parameter in eligibility_trace: eligibility_trace[parameter][ network.hidden_neurons - network.n_non_learnable_neurons] *= learning_signal network.get_parameters( )[parameter] += eligibility_trace[parameter] * learning_rate # First global update if (s + 1) % (tau * deltas) == 0: dist.barrier() global_update(group, rank, network, weights_list) dist.barrier() S = input_train.shape[-1] * local_train_length ### Remainder of the steps for s in range(deltas, S): print(s) if rank != 0: if s % S_prime == 0: # Reset internal state for each example network.reset_internal_state() # lr decay if (s % S / 5 == 0) & (learning_rate > 0.005): learning_rate /= 2 # Feedforward sampling log_proba, ls_temp, et_temp \ = feedforward_sampling(network, local_training_sequence[indices[0]], et_temp, ls_temp, s, S_prime, alpha, r) # Local feedback and global update learning_signal, ls_temp, eligibility_trace, et_temp \ = local_feedback_and_update(network, eligibility_trace, learning_signal, et_temp, ls_temp, learning_rate, kappa, s, deltas) ## Every few timesteps, record test losses if (s + 1) % 40 == 0: _, loss = get_acc_and_loss(network, test_data[test_indx], test_label[test_indx]) network.set_mode('train') # Global update if (s + 1) % (tau * deltas) == 0: dist.barrier() global_update(group, rank, network, weights_list) dist.barrier() if rank == 0: global_test_indices = np.random.choice(np.arange( input_test.shape[0]), [epochs_test], replace=False) np.random.shuffle(global_test_indices) print(global_test_indices) global_acc, _ = get_acc_and_loss(network, input_test[global_test_indices], output_test[global_test_indices]) print('Final global test accuracy: %f' % global_acc)
def train_net(args, config): # setup logger logger, final_output_path = create_logger(config.OUTPUT_PATH, args.cfg, config.DATASET.IMAGE_SET, split='train') model_prefix = os.path.join(final_output_path, config.MODEL_PREFIX) if args.log_dir is None: args.log_dir = os.path.join(final_output_path, 'tensorboard_logs') pprint.pprint(args) logger.info('training args:{}\n'.format(args)) pprint.pprint(config) logger.info('training config:{}\n'.format(pprint.pformat(config))) # manually set random seed if config.RNG_SEED > -1: np.random.seed(config.RNG_SEED) torch.random.manual_seed(config.RNG_SEED) torch.cuda.manual_seed_all(config.RNG_SEED) # cudnn torch.backends.cudnn.benchmark = False if args.cudnn_off: torch.backends.cudnn.enabled = False if args.dist: model = eval(config.MODULE)(config) local_rank = int(os.environ.get('LOCAL_RANK') or 0) os.environ['CUDA_VISIBLE_DEVICES'] = config.GPUS torch.cuda.set_device(local_rank) master_address = os.environ['MASTER_ADDR'] master_port = int(os.environ['MASTER_PORT'] or 23456) world_size = int(os.environ['WORLD_SIZE'] or 1) rank = int(os.environ['RANK'] or 0) if args.slurm: distributed.init_process_group(backend='nccl') else: distributed.init_process_group(backend='nccl', init_method='tcp://{}:{}'.format( master_address, master_port), world_size=world_size, rank=rank, group_name='mtorch') print( f'native distributed, size: {world_size}, rank: {rank}, local rank: {local_rank}' ) torch.cuda.set_device(local_rank) config.GPUS = str(local_rank) model = model.cuda() if not config.TRAIN.FP16: model = DDP(model, device_ids=[local_rank], output_device=local_rank) if rank == 0: summary_parameters( model.module if isinstance( model, torch.nn.parallel.DistributedDataParallel) else model, logger) shutil.copy(args.cfg, final_output_path) shutil.copy(inspect.getfile(eval(config.MODULE)), final_output_path) writer = None if args.log_dir is not None: tb_log_dir = os.path.join(args.log_dir, 'rank{}'.format(rank)) if not os.path.exists(tb_log_dir): os.makedirs(tb_log_dir) writer = SummaryWriter(log_dir=tb_log_dir) train_loader, train_sampler = make_dataloader(config, mode='train', distributed=True, num_replicas=world_size, rank=rank, expose_sampler=True) val_loader = make_dataloader(config, mode='val', distributed=True, num_replicas=world_size, rank=rank) batch_size = world_size * (sum(config.TRAIN.BATCH_IMAGES) if isinstance(config.TRAIN.BATCH_IMAGES, list) else config.TRAIN.BATCH_IMAGES) if config.TRAIN.GRAD_ACCUMULATE_STEPS > 1: batch_size = batch_size * config.TRAIN.GRAD_ACCUMULATE_STEPS base_lr = config.TRAIN.LR * batch_size optimizer_grouped_parameters = [{ 'params': [p for n, p in model.named_parameters() if _k in n], 'lr': base_lr * _lr_mult } for _k, _lr_mult in config.TRAIN.LR_MULT] optimizer_grouped_parameters.append({ 'params': [ p for n, p in model.named_parameters() if all([_k not in n for _k, _ in config.TRAIN.LR_MULT]) ] }) if config.TRAIN.OPTIMIZER == 'SGD': optimizer = optim.SGD(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WD) elif config.TRAIN.OPTIMIZER == 'Adam': optimizer = optim.Adam(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, weight_decay=config.TRAIN.WD) elif config.TRAIN.OPTIMIZER == 'AdamW': optimizer = AdamW(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, betas=(0.9, 0.999), eps=1e-6, weight_decay=config.TRAIN.WD, correct_bias=True) else: raise ValueError('Not support optimizer {}!'.format( config.TRAIN.OPTIMIZER)) total_gpus = 
world_size else: #os.environ['CUDA_VISIBLE_DEVICES'] = config.GPUS model = eval(config.MODULE)(config) summary_parameters(model, logger) shutil.copy(args.cfg, final_output_path) shutil.copy(inspect.getfile(eval(config.MODULE)), final_output_path) num_gpus = len(config.GPUS.split(',')) assert num_gpus <= 1 or (not config.TRAIN.FP16), "Not support fp16 with torch.nn.DataParallel. " \ "Please use amp.parallel.DistributedDataParallel instead." total_gpus = num_gpus rank = None writer = SummaryWriter( log_dir=args.log_dir) if args.log_dir is not None else None # model if num_gpus > 1: model = torch.nn.DataParallel( model, device_ids=[int(d) for d in config.GPUS.split(',')]).cuda() else: torch.cuda.set_device(int(config.GPUS)) model.cuda() # loader train_loader = make_dataloader(config, mode='train', distributed=False) val_loader = make_dataloader(config, mode='val', distributed=False) train_sampler = None batch_size = num_gpus * (sum(config.TRAIN.BATCH_IMAGES) if isinstance( config.TRAIN.BATCH_IMAGES, list) else config.TRAIN.BATCH_IMAGES) if config.TRAIN.GRAD_ACCUMULATE_STEPS > 1: batch_size = batch_size * config.TRAIN.GRAD_ACCUMULATE_STEPS base_lr = config.TRAIN.LR * batch_size optimizer_grouped_parameters = [{ 'params': [p for n, p in model.named_parameters() if _k in n], 'lr': base_lr * _lr_mult } for _k, _lr_mult in config.TRAIN.LR_MULT] optimizer_grouped_parameters.append({ 'params': [ p for n, p in model.named_parameters() if all([_k not in n for _k, _ in config.TRAIN.LR_MULT]) ] }) if config.TRAIN.OPTIMIZER == 'SGD': optimizer = optim.SGD(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, momentum=config.TRAIN.MOMENTUM, weight_decay=config.TRAIN.WD) elif config.TRAIN.OPTIMIZER == 'Adam': optimizer = optim.Adam(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, weight_decay=config.TRAIN.WD) elif config.TRAIN.OPTIMIZER == 'AdamW': optimizer = AdamW(optimizer_grouped_parameters, lr=config.TRAIN.LR * batch_size, betas=(0.9, 0.999), eps=1e-6, weight_decay=config.TRAIN.WD, correct_bias=True) else: raise ValueError('Not support optimizer {}!'.format( config.TRAIN.OPTIMIZER)) # partial load pretrain state dict if config.NETWORK.PARTIAL_PRETRAIN != "": pretrain_state_dict = torch.load( config.NETWORK.PARTIAL_PRETRAIN, map_location=lambda storage, loc: storage)['state_dict'] prefix_change = [ prefix_change.split('->') for prefix_change in config.NETWORK.PARTIAL_PRETRAIN_PREFIX_CHANGES ] pretrain_state_dict_parsed = {} for k, v in pretrain_state_dict.items(): no_match = True for pretrain_prefix, new_prefix in prefix_change: if k.startswith(pretrain_prefix): k = new_prefix + k[len(pretrain_prefix):] pretrain_state_dict_parsed[k] = v no_match = False break if no_match: pretrain_state_dict_parsed[k] = v if 'module.vlbert.relationsip_head.caption_image_relationship.weight' in pretrain_state_dict \ and config.NETWORK.LOAD_REL_HEAD: pretrain_state_dict_parsed['module.final_mlp.1.weight'] \ = pretrain_state_dict['module.vlbert.relationsip_head.caption_image_relationship.weight'][1:2].float() \ - pretrain_state_dict['module.vlbert.relationsip_head.caption_image_relationship.weight'][0:1].float() pretrain_state_dict_parsed['module.final_mlp.1.bias'] \ = pretrain_state_dict['module.vlbert.relationsip_head.caption_image_relationship.bias'][1:2].float() \ - pretrain_state_dict['module.vlbert.relationsip_head.caption_image_relationship.bias'][0:1].float() if config.NETWORK.PARTIAL_PRETRAIN_SEGMB_INIT: if isinstance( pretrain_state_dict_parsed[ 
'module.vlbert._module.token_type_embeddings.weight'], torch.HalfTensor): pretrain_state_dict_parsed['module.vlbert._module.token_type_embeddings.weight'] = \ pretrain_state_dict_parsed['module.vlbert._module.token_type_embeddings.weight'].float() pretrain_state_dict_parsed['module.vlbert._module.token_type_embeddings.weight'][1] = \ pretrain_state_dict_parsed['module.vlbert._module.token_type_embeddings.weight'][0] pretrain_state_dict = pretrain_state_dict_parsed smart_partial_load_model_state_dict(model, pretrain_state_dict) # metrics train_metrics_list = [ snlive_metrics.Accuracy(allreduce=args.dist, num_replicas=world_size if args.dist else 1) ] val_metrics_list = [ snlive_metrics.Accuracy(allreduce=args.dist, num_replicas=world_size if args.dist else 1) ] for output_name, display_name in config.TRAIN.LOSS_LOGGERS: train_metrics_list.append( snlive_metrics.LossLogger( output_name, display_name=display_name, allreduce=args.dist, num_replicas=world_size if args.dist else 1)) train_metrics = CompositeEvalMetric() val_metrics = CompositeEvalMetric() for child_metric in train_metrics_list: train_metrics.add(child_metric) for child_metric in val_metrics_list: val_metrics.add(child_metric) # epoch end callbacks epoch_end_callbacks = [] if (rank is None) or (rank == 0): epoch_end_callbacks = [ Checkpoint(model_prefix, config.CHECKPOINT_FREQUENT) ] validation_monitor = ValidationMonitor( do_validation, val_loader, val_metrics, host_metric_name='Acc', label_index_in_batch=config.DATASET.LABEL_INDEX_IN_BATCH) # optimizer initial lr before for group in optimizer.param_groups: group.setdefault('initial_lr', group['lr']) # resume/auto-resume if rank is None or rank == 0: smart_resume(model, optimizer, validation_monitor, config, model_prefix, logger) if args.dist: begin_epoch = torch.tensor(config.TRAIN.BEGIN_EPOCH).cuda() distributed.broadcast(begin_epoch, src=0) config.TRAIN.BEGIN_EPOCH = begin_epoch.item() # batch end callbacks batch_size = len(config.GPUS.split(',')) * config.TRAIN.BATCH_IMAGES batch_end_callbacks = [ Speedometer(batch_size, config.LOG_FREQUENT, batches_per_epoch=len(train_loader), epochs=config.TRAIN.END_EPOCH - config.TRAIN.BEGIN_EPOCH) ] # setup lr step and lr scheduler if config.TRAIN.LR_SCHEDULE == 'plateau': print("Warning: not support resuming on plateau lr schedule!") lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode='max', factor=config.TRAIN.LR_FACTOR, patience=1, verbose=True, threshold=1e-4, threshold_mode='rel', cooldown=2, min_lr=0, eps=1e-8) elif config.TRAIN.LR_SCHEDULE == 'triangle': lr_scheduler = WarmupLinearSchedule( optimizer, config.TRAIN.WARMUP_STEPS if config.TRAIN.WARMUP else 0, t_total=int(config.TRAIN.END_EPOCH * len(train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS), last_epoch=int(config.TRAIN.BEGIN_EPOCH * len(train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS) - 1) elif config.TRAIN.LR_SCHEDULE == 'step': lr_iters = [ int(epoch * len(train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS) for epoch in config.TRAIN.LR_STEP ] lr_scheduler = WarmupMultiStepLR( optimizer, milestones=lr_iters, gamma=config.TRAIN.LR_FACTOR, warmup_factor=config.TRAIN.WARMUP_FACTOR, warmup_iters=config.TRAIN.WARMUP_STEPS if config.TRAIN.WARMUP else 0, warmup_method=config.TRAIN.WARMUP_METHOD, last_epoch=int(config.TRAIN.BEGIN_EPOCH * len(train_loader) / config.TRAIN.GRAD_ACCUMULATE_STEPS) - 1) else: raise ValueError("Not support lr schedule: {}.".format( config.TRAIN.LR_SCHEDULE)) # broadcast parameter and optimizer state from rank 0 before training 
start if args.dist: for v in model.state_dict().values(): distributed.broadcast(v, src=0) # for v in optimizer.state_dict().values(): # distributed.broadcast(v, src=0) best_epoch = torch.tensor(validation_monitor.best_epoch).cuda() best_val = torch.tensor(validation_monitor.best_val).cuda() distributed.broadcast(best_epoch, src=0) distributed.broadcast(best_val, src=0) validation_monitor.best_epoch = best_epoch.item() validation_monitor.best_val = best_val.item() # apex: amp fp16 mixed-precision training if config.TRAIN.FP16: # model.apply(bn_fp16_half_eval) model, optimizer = amp.initialize( model, optimizer, opt_level='O2', keep_batchnorm_fp32=False, loss_scale=config.TRAIN.FP16_LOSS_SCALE, min_loss_scale=128.0) if args.dist: model = Apex_DDP(model, delay_allreduce=True) train(model, optimizer, lr_scheduler, train_loader, train_sampler, train_metrics, config.TRAIN.BEGIN_EPOCH, config.TRAIN.END_EPOCH, logger, rank=rank, batch_end_callbacks=batch_end_callbacks, epoch_end_callbacks=epoch_end_callbacks, writer=writer, validation_monitor=validation_monitor, fp16=config.TRAIN.FP16, clip_grad_norm=config.TRAIN.CLIP_GRAD_NORM, gradient_accumulate_steps=config.TRAIN.GRAD_ACCUMULATE_STEPS) return rank, model
def main(args): init_process_group(backend='nccl') with open(args.config) as file: config = apply_dict(Dict, json.load(file)) config.update(vars(args)) config.update( dict(world_size=distributed.get_world_size(), global_rank=distributed.get_rank(), device_count=cuda.device_count(), local_rank=distributed.get_rank() % cuda.device_count())) print(f'config: {config}') backends.cudnn.benchmark = True backends.cudnn.fastest = True np.random.seed(config.seed) torch.manual_seed(config.seed) cuda.manual_seed(config.seed) cuda.set_device(config.local_rank) train_dataset = ImageNet(root=config.train_root, meta=config.train_meta, transform=transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.2), transforms.ToTensor(), transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) ])) val_dataset = ImageNet(root=config.val_root, meta=config.val_meta, transform=transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), ])) train_sampler = utils.data.distributed.DistributedSampler(train_dataset) val_sampler = utils.data.distributed.DistributedSampler(val_dataset) train_data_loader = utils.data.DataLoader( dataset=train_dataset, batch_size=config.local_batch_size, sampler=train_sampler, num_workers=config.num_workers, pin_memory=True) val_data_loader = utils.data.DataLoader(dataset=val_dataset, batch_size=config.local_batch_size, sampler=val_sampler, num_workers=config.num_workers, pin_memory=True) model = SuperMobileNetV2(first_conv_param=Dict(in_channels=3, out_channels=32, kernel_size=3, stride=2), middle_conv_params=[ Dict(in_channels=32, out_channels=16, expand_ratio_list=[3, 6], kernel_size_list=[3, 5], blocks=1, stride=1), Dict(in_channels=16, out_channels=24, expand_ratio_list=[3, 6], kernel_size_list=[3, 5], blocks=2, stride=2), Dict(in_channels=24, out_channels=32, expand_ratio_list=[3, 6], kernel_size_list=[3, 5], blocks=3, stride=2), Dict(in_channels=32, out_channels=64, expand_ratio_list=[3, 6], kernel_size_list=[3, 5], blocks=4, stride=2), Dict(in_channels=64, out_channels=96, expand_ratio_list=[3, 6], kernel_size_list=[3, 5], blocks=3, stride=1), Dict(in_channels=96, out_channels=160, expand_ratio_list=[3, 6], kernel_size_list=[3, 5], blocks=3, stride=2), Dict(in_channels=160, out_channels=320, expand_ratio_list=[3, 6], kernel_size_list=[3, 5], blocks=1, stride=1), ], last_conv_param=Dict(in_channels=320, out_channels=1280, kernel_size=1, stride=1), drop_prob=config.drop_prob, num_classes=1000).cuda() for tensor in model.state_dict().values(): distributed.broadcast(tensor, 0) criterion = CrossEntropyLoss(config.label_smoothing) config.global_batch_size = config.local_batch_size * config.world_size config.lr = config.lr * config.global_batch_size / config.global_batch_denom optimizer = torch.optim.RMSprop(params=model.weights(), lr=config.lr, alpha=config.alpha, eps=config.eps, weight_decay=config.weight_decay, momentum=config.momentum) lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=config.milestones, gamma=config.gamma) last_epoch = -1 global_step = 0 if config.checkpoint: checkpoint = Dict(torch.load(config.checkpoint)) model.load_state_dict(checkpoint.model_state_dict) optimizer.load_state_dict(checkpoint.optimizer_state_dict) last_epoch = checkpoint.last_epoch global_step = checkpoint.global_step elif config.global_rank 
== 0: if os.path.exists(config.checkpoint_directory): shutil.rmtree(config.checkpoint_directory) if os.path.exists(config.event_directory): shutil.rmtree(config.event_directory) os.makedirs(config.checkpoint_directory) os.makedirs(config.event_directory) if config.global_rank == 0: summary_writer = SummaryWriter(config.event_directory) if config.training: for epoch in range(last_epoch + 1, config.num_epochs): train_sampler.set_epoch(epoch) lr_scheduler.step(epoch) model.train() for local_step, (images, targets) in enumerate(train_data_loader): step_begin = time.time() images = images.cuda(non_blocking=True) targets = targets.cuda(non_blocking=True) logits = model(images) loss = criterion(logits, targets) / config.world_size optimizer.zero_grad() loss.backward() for parameter in model.parameters(): distributed.all_reduce(parameter.grad) optimizer.step() predictions = torch.argmax(logits, dim=1) accuracy = torch.mean( (predictions == targets).float()) / config.world_size for tensor in [loss, accuracy]: distributed.all_reduce(tensor) step_end = time.time() if config.global_rank == 0: summary_writer.add_scalars( main_tag='loss', tag_scalar_dict=dict(train=loss), global_step=global_step) summary_writer.add_scalars( main_tag='accuracy', tag_scalar_dict=dict(train=accuracy), global_step=global_step) print( f'[training] epoch: {epoch} global_step: {global_step} local_step: {local_step} ' f'loss: {loss:.4f} accuracy: {accuracy:.4f} [{step_end - step_begin:.4f}s]' ) global_step += 1 if config.global_rank == 0: torch.save( dict(model_state_dict=model.state_dict(), optimizer_state_dict=optimizer.state_dict(), last_epoch=epoch, global_step=global_step), f'{config.checkpoint_directory}/epoch_{epoch}') if config.validation: model.eval() with torch.no_grad(): average_loss = 0 average_accuracy = 0 for local_step, (images, targets) in enumerate(val_data_loader): images = images.cuda(non_blocking=True) targets = targets.cuda(non_blocking=True) logits = model(images) loss = criterion(logits, targets) / config.world_size predictions = torch.argmax(logits, dim=1) accuracy = torch.mean( (predictions == targets).float()) / config.world_size for tensor in [loss, accuracy]: distributed.all_reduce(tensor) average_loss += loss average_accuracy += accuracy average_loss /= (local_step + 1) average_accuracy /= (local_step + 1) if config.global_rank == 0: summary_writer.add_scalars( main_tag='loss', tag_scalar_dict=dict(val=average_loss), global_step=global_step) summary_writer.add_scalars( main_tag='accuracy', tag_scalar_dict=dict(val=average_accuracy), global_step=global_step) print( f'[validation] epoch: {epoch} loss: {average_loss:.4f} accuracy: {average_accuracy:.4f}' ) elif config.validation: model.eval() with torch.no_grad(): average_loss = 0 average_accuracy = 0 for local_step, (images, targets) in enumerate(val_data_loader): images = images.cuda(non_blocking=True) targets = targets.cuda(non_blocking=True) logits = model(images) loss = criterion(logits, targets) / config.world_size predictions = torch.argmax(logits, dim=1) accuracy = torch.mean( (predictions == targets).float()) / config.world_size for tensor in [loss, accuracy]: distributed.all_reduce(tensor) average_loss += loss average_accuracy += accuracy average_loss /= (local_step + 1) average_accuracy /= (local_step + 1) if config.global_rank == 0: print( f'[validation] epoch: {last_epoch} loss: {average_loss:.4f} accuracy: {average_accuracy:.4f}' ) if config.global_rank == 0: summary_writer.close()
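# The training loop above synchronizes manually instead of using DistributedDataParallel:
# rank 0's state_dict tensors are broadcast once at startup, and gradients are all-reduced
# before optimizer.step() (the script pre-divides the loss by the world size so the summed
# gradients come out averaged). A minimal, hedged sketch of just that pattern, assuming
# torch.distributed is already initialized (helper names below are illustrative):
import torch
from torch import distributed

def sync_initial_parameters(model: torch.nn.Module) -> None:
    # make every rank start from rank 0's parameters and buffers
    for tensor in model.state_dict().values():
        distributed.broadcast(tensor, 0)

def average_gradients(model: torch.nn.Module) -> None:
    # call between loss.backward() and optimizer.step(); dividing the summed gradients here
    # is equivalent to pre-scaling the loss as the script above does
    world_size = distributed.get_world_size()
    for param in model.parameters():
        if param.grad is not None:
            distributed.all_reduce(param.grad)
            param.grad /= world_size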
def broadcast_initialized_params(self, src: int = 0):
    super().broadcast_initialized_params(src)
    distributed.broadcast(self.input_low, src)
    distributed.broadcast(self.input_range, src)
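# A hedged sketch of the idea behind broadcast_initialized_params above: a module that keeps
# per-tensor range buffers (the class and buffer shapes here are illustrative, not the original
# implementation) pushes rank 0's freshly initialized values to all other ranks so every worker
# quantizes with identical ranges.
import torch
from torch import distributed
from torch import nn

class RangeTracker(nn.Module):
    def __init__(self, num_channels: int):
        super().__init__()
        self.register_buffer('input_low', torch.zeros(num_channels))
        self.register_buffer('input_range', torch.ones(num_channels))

    def broadcast_initialized_params(self, src: int = 0) -> None:
        distributed.broadcast(self.input_low, src)
        distributed.broadcast(self.input_range, src)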
def step(self, closure=None): loss = None if closure is not None: loss = closure() for group in self.param_groups: weight_decay = group['weight_decay'] momentum = group['momentum'] all_grads = [] for p in group['params']: if p.grad is None: continue d_p = p.grad.data if self.compression_buffer==False: if weight_decay != 0: d_p.add_(weight_decay, p.data) if momentum != 0: # signum param_state = self.state[p] if 'momentum_buffer' not in param_state: buf = param_state['momentum_buffer'] = torch.zeros_like(p.data) else: buf = param_state['momentum_buffer'] buf.mul_(momentum).add_((1 - momentum),d_p) d_p.copy_(buf) all_grads.append(d_p) dev_grads_buckets = _take_tensors(all_grads, self.bucket_size) for dev_grads in dev_grads_buckets: d_p_new = _flatten_dense_tensors(dev_grads) if self.all_reduce: coded, data_time = QSGD_gpu.encode(d_p_new,self.enable_max) tensor_decoded = QSGD_gpu.decode(coded, cuda = True) dist.all_reduce(tensor_decoded, group = 0) tensor_decoded = tensor_decoded / dist.get_world_size() if self.bidirection_compress: if dist.get_rank() == 0: coded, data_time = QSGD_gpu.encode(tensor_decoded,self.enable_max) tensor_decoded = QSGD_gpu.decode(coded, cuda = True) else: tensor_decoded = torch.zeros(tensor_decoded.size()).type_as(tensor_decoded) dist.all_reduce(tensor_decoded, group = 0) d_p_new = tensor_decoded else: if self.nodes > 1: if self.compression_buffer: coded, data_time = QSGD_gpu.encode(d_p_new,self.enable_max) #specific coded dic just on CPU tensor_signs = coded['signs'] tensor_selected = coded['selected'] tensor_norm = coded['norm'] #size tensor_signs_size = self.pack_len_tensor_into_tensor(tensor_signs) tensor_selected_size = self.pack_len_tensor_into_tensor(tensor_selected) #tensor_norm_size = self.pack_len_tensor_into_tensor(tensor_norm) norm doesn't need size #custom ''' print(tensor_signs.type()) print(tensor_selected.type()) print(tensor_norm.type()) ''' else: d_p_new = torch.sign(d_p_new) if self.local_rank == 0: if self.all_gather_commu: #This version only for instances each with one GPU for node_index in self.inter_node_list: if node_index != self.nodes_rank: d.set() coded_temp = coded.copy() b.set() tensor_signs_size_temp = tensor_signs_size.clone() dist.broadcast(tensor_signs_size_temp, node_index, group = self.all_inter_node_group) b.record() c.set() tensor_signs_temp = torch.zeros([int(tensor_signs_size_temp[0])], device = self.device, dtype=torch.int) print('tensor_signs_temp', tensor_signs_temp.size()) c.record() a.set() dist.broadcast(tensor_signs_temp, node_index, group = self.all_inter_node_group) a.record() d.record() e.set() tensor_selected_size_temp = tensor_selected_size.clone() dist.broadcast(tensor_selected_size_temp, node_index, group = self.all_inter_node_group) tensor_selected_temp = torch.zeros([int(tensor_selected_size_temp[0])], device = self.device, dtype=torch.long) dist.broadcast(tensor_selected_temp, node_index, group = self.all_inter_node_group) print('tensor_selected_temp', tensor_selected_temp.size()) e.record() f.set() tensor_norm_temp = tensor_norm.clone() dist.broadcast(tensor_norm_temp, node_index, group = self.all_inter_node_group) coded_temp['signs'] = tensor_signs_temp coded_temp['selected'] = tensor_selected_temp coded_temp['norm'] = tensor_norm_temp tensor_decoded = QSGD_gpu.decode(coded_temp, cuda = True) d_p_new = d_p_new + tensor_decoded f.record() print('a', a.get_time()) print('b', b.get_time()) print('c', c.get_time()) print('d', d.get_time()) print('e', e.get_time()) print('f', f.get_time()) else: 
dist.broadcast(tensor_signs_size, node_index, group = self.all_inter_node_group) dist.broadcast(tensor_signs, node_index, group = self.all_inter_node_group) dist.broadcast(tensor_selected_size, node_index, group = self.all_inter_node_group) dist.broadcast(tensor_selected, node_index, group = self.all_inter_node_group) dist.broadcast(tensor_norm, node_index, group = self.all_inter_node_group) d_p_new = d_p_new / dist.get_world_size() else: if dist.get_rank() == 0: for index, inter_node_group in enumerate(self.inter_node_group_list): coded_temp = coded.copy() tensor_signs_size_temp = tensor_signs_size.clone() dist.broadcast(tensor_signs_size_temp, self.inter_node_list[index + 1], group = inter_node_group) tensor_signs_temp = torch.randn([int(tensor_signs_size_temp[0])]).type_as(tensor_signs) dist.broadcast(tensor_signs_temp, self.inter_node_list[index + 1], group = inter_node_group) tensor_selected_size_temp = tensor_selected_size.clone() dist.broadcast(tensor_selected_size_temp, self.inter_node_list[index + 1], group = inter_node_group) tensor_selected_temp = torch.randn([int(tensor_selected_size_temp[0])]).type_as(tensor_selected) dist.broadcast(tensor_selected_temp, self.inter_node_list[index + 1], group = inter_node_group) tensor_norm_temp = tensor_norm.clone() dist.broadcast(tensor_norm_temp, self.inter_node_list[index + 1], group = inter_node_group) coded_temp['signs'] = tensor_signs_temp coded_temp['selected'] = tensor_selected_temp coded_temp['norm'] = tensor_norm_temp tensor_decoded = QSGD_gpu.decode(coded_temp, cuda = True) d_p_new = d_p_new + tensor_decoded ''' #temp print(tensor_decoded) tensor_decoded_temp = tensor_decoded.clone() dist.broadcast(tensor_decoded_temp, self.inter_node_list[index + 1], group = inter_node_group) if tensor_decoded == tensor_decoded_temp: print('success') print(tensor_signs_size_temp) print(tensor_selected_size_temp) ''' d_p_new = d_p_new / dist.get_world_size() else: dist.broadcast(tensor_signs_size, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) dist.broadcast(tensor_signs, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) dist.broadcast(tensor_selected_size, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) dist.broadcast(tensor_selected, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) dist.broadcast(tensor_norm, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) ''' #temp tensor_decoded = QSGD_gpu.decode(coded, cuda = True) print(tensor_decoded) dist.broadcast(tensor_decoded, dist.get_rank(), group = self.inter_node_group_list[self.nodes_rank - 1]) print(tensor_signs_size) print(tensor_selected_size) ''' dist.barrier(group = self.all_inter_node_group) #os._exit() if self.bidirection_compress: if dist.get_rank() == 0: coded, data_time = QSGD_gpu.encode(d_p_new,self.enable_max) tensor_signs = coded['signs'] tensor_selected = coded['selected'] tensor_norm = coded['norm'] tensor_signs_size = self.pack_len_tensor_into_tensor(tensor_signs) tensor_selected_size = self.pack_len_tensor_into_tensor(tensor_selected) dist.barrier(group = self.all_inter_node_group) dist.broadcast(tensor_signs_size, 0, group = self.all_inter_node_group) dist.broadcast(tensor_selected_size, 0, group = self.all_inter_node_group) if dist.get_rank() != 0: torch.cuda.synchronize() tensor_signs = torch.randn([int(tensor_signs_size[0])]).type_as(tensor_signs) tensor_selected = torch.randn([int(tensor_selected_size[0])]).type_as(tensor_selected) 
torch.cuda.synchronize() dist.barrier(group = self.all_inter_node_group) dist.broadcast(tensor_signs, 0, group = self.all_inter_node_group) dist.broadcast(tensor_selected, 0, group = self.all_inter_node_group) dist.broadcast(tensor_norm, 0, group = self.all_inter_node_group) coded['signs'] = tensor_signs coded['selected'] = tensor_selected coded['norm'] = tensor_norm tensor_decoded = QSGD_gpu.decode(coded, cuda = True) d_p_new = tensor_decoded else: if dist.get_rank() == 0: dist.barrier(group = self.all_inter_node_group) dist.broadcast(d_p_new, 0, group = self.all_inter_node_group) else: # test for one coded, data_time = QSGD_gpu.encode(d_p_new,self.enable_max) tensor_decoded = QSGD_gpu.decode(coded, cuda = True) d_p_new = tensor_decoded #unflatten dev_grads_new = _unflatten_dense_tensors(d_p_new,dev_grads) for grad, reduced in zip(dev_grads, dev_grads_new): grad.copy_(reduced) for p in group['params']: if self.compression_buffer: if weight_decay != 0: p.grad.data.add_(weight_decay, p.data) p.data.add_(-group['lr'], p.grad.data) return loss
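# The optimizer above repeatedly uses one communication idiom worth isolating: the QSGD-coded
# 'signs' and 'selected' tensors have data-dependent lengths, so the sender first broadcasts a
# one-element size tensor, receivers allocate a buffer of that length, and only then is the
# payload broadcast. A minimal, hedged sketch assuming the process group is already set up:
import torch
import torch.distributed as dist

def broadcast_variable_length(payload, src, group=None):
    # every rank calls this; non-source ranks may pass a dummy payload of the right dtype/device
    size = torch.tensor([payload.numel()], dtype=torch.long, device=payload.device)
    dist.broadcast(size, src, group=group)
    if dist.get_rank() != src:
        payload = torch.empty(int(size.item()), dtype=payload.dtype, device=payload.device)
    dist.broadcast(payload, src, group=group)
    return payload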
def __init__(self, module, device_ids=None, output_device=None, dim=0): super(DistributedDataParallel, self).__init__() if device_ids is None: device_ids = list(range(torch.cuda.device_count())) if output_device is None: output_device = device_ids[0] self.dim = dim self.module = module self.device_ids = device_ids self.output_device = output_device # Sync params and buffers for p in self.module.state_dict().values(): dist.broadcast(p, 0) if len(device_ids) > 1: # TODO: we don't need to replicate params in here. they're always going to # be broadcasted using larger blocks in broadcast_coalesced, so it might be # better to not pollute the caches with these small blocks self._module_copies = replicate(self.module, self.device_ids) self._module_copies[0] = self.module for module_copy in self._module_copies[1:]: for param, copy_param in zip(self.module.parameters(), module_copy.parameters()): copy_param.detach_() copy_param.requires_grad = param.requires_grad else: self._module_copies = [self.module] # Split parameters into buckets that will coalesce reductions # TODO: different types need different buckets t = None for p in self.module.parameters(): tp = type(p.data) if t is not None and t is not tp: raise ValueError("DistributedDataParallel requires all parameters' data to be of the same type") t = tp self.bucket_sizes = [] self.bucket_map = {} MB = 1024 * 1024 self.broadcast_bucket_size = 10 * MB # used for param sync before forward bucket_bytes_cap = 1 * MB bucket_bytes = bucket_bytes_cap # to init the first bucket immediately for param_tuple in zip(*map(lambda m: m.parameters(), self._module_copies)): if bucket_bytes >= bucket_bytes_cap: self.bucket_sizes.append(0) bucket_bytes = 0 self.bucket_sizes[-1] += 1 for p in param_tuple: self.bucket_map[p] = len(self.bucket_sizes) - 1 bucket_bytes += p.numel() * p.element_size() self.buckets = [[[] for _ in range(len(self.device_ids))] for _ in range(len(self.bucket_sizes))] self.bucket_events = [[None] * len(self.device_ids) for _ in range(len(self.bucket_sizes))] self.reduced = [False] * len(self.bucket_sizes) self._register_grad_hooks() self.dispatch_lock = threading.Lock() self._start_reduction_threads()
def broadcast_initialized_params(self, src: int = 0):
    super().broadcast_initialized_params(src)
    distributed.broadcast(self.scale, src=src)
    distributed.broadcast(self.signed_tensor, src=src)
def train300_mlperf_coco(args): global torch from coco import COCO # Check that GPUs are actually available use_cuda = not args.no_cuda and torch.cuda.is_available() args.distributed = False if use_cuda: try: from apex.parallel import DistributedDataParallel as DDP if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 except: raise ImportError("Please install APEX from https://github.com/nvidia/apex") local_seed = args.seed if args.distributed: # necessary pytorch imports import torch.utils.data.distributed import torch.distributed as dist if args.no_cuda: device = torch.device('cpu') else: torch.cuda.set_device(args.local_rank) device = torch.device('cuda') dist.init_process_group(backend='nccl', init_method='env://') # set seeds properly args.seed = broadcast_seeds(args.seed, device) local_seed = (args.seed + dist.get_rank()) % 2**32 mllogger.event(key=mllog_const.SEED, value=local_seed) torch.manual_seed(local_seed) np.random.seed(seed=local_seed) args.rank = dist.get_rank() if args.distributed else args.local_rank print("args.rank = {}".format(args.rank)) print("local rank = {}".format(args.local_rank)) print("distributed={}".format(args.distributed)) dboxes = dboxes300_coco() encoder = Encoder(dboxes) input_size = 300 train_trans = SSDTransformer(dboxes, (input_size, input_size), val=False, num_cropping_iterations=args.num_cropping_iterations) val_trans = SSDTransformer(dboxes, (input_size, input_size), val=True) val_annotate = os.path.join(args.data, "annotations/instances_val2017.json") val_coco_root = os.path.join(args.data, "val2017") train_annotate = os.path.join(args.data, "annotations/instances_train2017.json") train_coco_root = os.path.join(args.data, "train2017") cocoGt = COCO(annotation_file=val_annotate) train_coco = COCODetection(train_coco_root, train_annotate, train_trans) val_coco = COCODetection(val_coco_root, val_annotate, val_trans) mllogger.event(key=mllog_const.TRAIN_SAMPLES, value=len(train_coco)) mllogger.event(key=mllog_const.EVAL_SAMPLES, value=len(val_coco)) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(train_coco) else: train_sampler = None train_dataloader = DataLoader(train_coco, batch_size=args.batch_size, shuffle=(train_sampler is None), sampler=train_sampler, num_workers=4) # set shuffle=True in DataLoader if args.rank==0: val_dataloader = DataLoader(val_coco, batch_size=args.val_batch_size or args.batch_size, shuffle=False, sampler=None, num_workers=4) else: val_dataloader = None ssd300 = SSD300(train_coco.labelnum, model_path=args.pretrained_backbone) if args.checkpoint is not None: print("loading model checkpoint", args.checkpoint) od = torch.load(args.checkpoint) ssd300.load_state_dict(od["model"]) ssd300.train() if use_cuda: ssd300.cuda() loss_func = Loss(dboxes) if use_cuda: loss_func.cuda() if args.distributed: N_gpu = torch.distributed.get_world_size() else: N_gpu = 1 # parallelize if args.distributed: ssd300 = DDP(ssd300) global_batch_size = N_gpu * args.batch_size mllogger.event(key=mllog_const.GLOBAL_BATCH_SIZE, value=global_batch_size) # Reference doesn't support group batch norm, so bn_span==local_batch_size mllogger.event(key=mllog_const.MODEL_BN_SPAN, value=args.batch_size) current_lr = args.lr * (global_batch_size / 32) assert args.batch_size % args.batch_splits == 0, "--batch-size must be divisible by --batch-splits" fragment_size = args.batch_size // args.batch_splits if args.batch_splits != 1: print("using gradient accumulation with fragments of size 
{}".format(fragment_size)) current_momentum = 0.9 optim = torch.optim.SGD(ssd300.parameters(), lr=current_lr, momentum=current_momentum, weight_decay=args.weight_decay) ssd_print(key=mllog_const.OPT_BASE_LR, value=current_lr) ssd_print(key=mllog_const.OPT_WEIGHT_DECAY, value=args.weight_decay) iter_num = args.iteration avg_loss = 0.0 inv_map = {v:k for k,v in val_coco.label_map.items()} success = torch.zeros(1) if use_cuda: success = success.cuda() if args.warmup: nonempty_imgs = len(train_coco) wb = int(args.warmup * nonempty_imgs / (N_gpu*args.batch_size)) ssd_print(key=mllog_const.OPT_LR_WARMUP_STEPS, value=wb) warmup_step = lambda iter_num, current_lr: lr_warmup(optim, wb, iter_num, current_lr, args) else: warmup_step = lambda iter_num, current_lr: None ssd_print(key=mllog_const.OPT_LR_WARMUP_FACTOR, value=args.warmup_factor) ssd_print(key=mllog_const.OPT_LR_DECAY_BOUNDARY_EPOCHS, value=args.lr_decay_schedule) mllogger.start( key=mllog_const.BLOCK_START, metadata={mllog_const.FIRST_EPOCH_NUM: 1, mllog_const.EPOCH_COUNT: args.epochs}) optim.zero_grad() for epoch in range(args.epochs): mllogger.start( key=mllog_const.EPOCH_START, metadata={mllog_const.EPOCH_NUM: epoch}) # set the epoch for the sampler if args.distributed: train_sampler.set_epoch(epoch) if epoch in args.lr_decay_schedule: current_lr *= 0.1 print("") print("lr decay step #{num}".format(num=args.lr_decay_schedule.index(epoch) + 1)) for param_group in optim.param_groups: param_group['lr'] = current_lr for nbatch, (img, img_id, img_size, bbox, label) in enumerate(train_dataloader): current_batch_size = img.shape[0] # Split batch for gradient accumulation img = torch.split(img, fragment_size) bbox = torch.split(bbox, fragment_size) label = torch.split(label, fragment_size) for (fimg, fbbox, flabel) in zip(img, bbox, label): current_fragment_size = fimg.shape[0] trans_bbox = fbbox.transpose(1,2).contiguous() if use_cuda: fimg = fimg.cuda() trans_bbox = trans_bbox.cuda() flabel = flabel.cuda() fimg = Variable(fimg, requires_grad=True) ploc, plabel = ssd300(fimg) gloc, glabel = Variable(trans_bbox, requires_grad=False), \ Variable(flabel, requires_grad=False) loss = loss_func(ploc, plabel, gloc, glabel) loss = loss * (current_fragment_size / current_batch_size) # weighted mean loss.backward() warmup_step(iter_num, current_lr) optim.step() optim.zero_grad() if not np.isinf(loss.item()): avg_loss = 0.999*avg_loss + 0.001*loss.item() if args.rank == 0 and args.log_interval and not iter_num % args.log_interval: print("Iteration: {:6d}, Loss function: {:5.3f}, Average Loss: {:.3f}"\ .format(iter_num, loss.item(), avg_loss)) iter_num += 1 if (args.val_epochs and (epoch+1) in args.val_epochs) or \ (args.val_interval and not (epoch+1) % args.val_interval): if args.distributed: world_size = float(dist.get_world_size()) for bn_name, bn_buf in ssd300.module.named_buffers(recurse=True): if ('running_mean' in bn_name) or ('running_var' in bn_name): dist.all_reduce(bn_buf, op=dist.ReduceOp.SUM) bn_buf /= world_size ssd_print(key=mllog_const.MODEL_BN_SPAN, value=bn_buf) if args.rank == 0: if not args.no_save: print("") print("saving model...") torch.save({"model" : ssd300.state_dict(), "label_map": train_coco.label_info}, "./models/iter_{}.pt".format(iter_num)) if coco_eval(ssd300, val_dataloader, cocoGt, encoder, inv_map, args.threshold, epoch + 1, iter_num, log_interval=args.log_interval, nms_valid_thresh=args.nms_valid_thresh): success = torch.ones(1) if use_cuda: success = success.cuda() if args.distributed: dist.broadcast(success, 0) if 
success[0]: return True mllogger.end( key=mllog_const.EPOCH_STOP, metadata={mllog_const.EPOCH_NUM: epoch}) mllogger.end( key=mllog_const.BLOCK_STOP, metadata={mllog_const.FIRST_EPOCH_NUM: 1, mllog_const.EPOCH_COUNT: args.epochs}) return False
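# The evaluation branch above ends with a pattern that keeps all ranks in lockstep: only rank 0
# runs COCO evaluation, so the "target accuracy reached" decision is written into a tensor and
# broadcast so that every worker returns on the same iteration. A hedged sketch
# (evaluate_on_rank0 is a hypothetical stand-in for the real coco_eval call):
import torch
import torch.distributed as dist

def should_stop(evaluate_on_rank0, device):
    flag = torch.zeros(1, device=device)
    if dist.get_rank() == 0 and evaluate_on_rank0():
        flag.fill_(1)
    dist.broadcast(flag, 0)  # every rank now agrees on the decision
    return bool(flag.item())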
def train_network(self, current_progress, overall_progress, time_start, time_end, batch_size=24): path = Path(__file__).parents[1] / 'models' / 'train' / 'train_log.log' logging.basicConfig( format="[%(levelname)s] %(message)s", level=logging.INFO, filename=path, filemode='w+' ) rank = int(os.environ['RANK']) if 'RANK' in os.environ else -1 plots = True # as default adam = False # DDP parameter, do not modify local_rank = -1 save_dir = Path(os.path.join(self.path, 'models')) weights_dir = os.path.join(save_dir, 'train') last = os.path.join(weights_dir, 'last.pt') best = os.path.join(weights_dir, 'best.pt') results_file = os.path.join(save_dir, 'results.txt') self.index_records(weights_dir) self.index_classes(self.path) total_batch_size = batch_size data = check_file(os.path.join(weights_dir, 'data.yaml')) cfg = check_file(os.path.join(weights_dir, 'cfg.yaml')) hyp = check_file(os.path.join(weights_dir, 'hyp.yaml')) cuda = self.device.type != 'cpu' init_seeds(2 + rank) with open(data) as f: data_dict = yaml.load(f, Loader=yaml.SafeLoader) with open(hyp) as f: hyp_dict = yaml.load(f, Loader=yaml.SafeLoader) logger.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp_dict.items())) with torch_distributed_zero_first(rank): check_dataset(data_dict) train_path = data_dict['train'] test_path = data_dict['val'] nc = data_dict['nc'] epochs = 10000 names = data_dict['names'] assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (len(names), nc, data) weights = 'models/train/last.pt' pretrained = weights.endswith('.pt') and self.model is not None if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally checkpoint = torch.load(weights, map_location=self.device) # load checkpoint model = Model(cfg or checkpoint['model'].yaml, ch=3, nc=nc, anchors=hyp_dict.get('anchors')).to(self.device) exclude = ['anchor'] if cfg or hyp_dict.get('anchors') else [] # exclude keys state_dict = checkpoint['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(cfg, ch=3, nc=nc, anchors=hyp_dict.get('anchors')).to(self.device) # create freeze = [] for k, v in model.named_parameters(): v.requires_grad = True if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False nbs = 64 accumulate = max(round(nbs / total_batch_size), 1) hyp_dict['weight_decay'] *= total_batch_size * accumulate / nbs logger.info(f"Scaled weight_decay = {hyp_dict['weight_decay']}") pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay # use adam optimizer, false as default if adam: # adjust beta1 to momentum optimizer = optim.Adam(pg0, lr=hyp_dict['lr0'], betas=(hyp_dict['momentum'], 0.999)) else: optimizer = optim.SGD(pg0, lr=hyp_dict['lr0'], momentum=hyp_dict['momentum'], nesterov=True) # add pg1 with weight_decay optimizer.add_param_group({'params': pg1, 'weight_decay': hyp_dict['weight_decay']}) optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, 
%g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR # false as default linear_lr = False if linear_lr: lf = lambda x: (1 - x / (epochs - 1)) * (1.0 - hyp_dict['lrf']) + hyp_dict['lrf'] # linear else: lf = one_cycle(1, hyp_dict['lrf'], epochs) scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) start_epoch, best_fitness = 0, 0.0 # Image sizes gs = int(model.stride.max()) # grid size (max stride) nl = model.model[-1].nl # number of detection layers (used for scaling hyp_dict['obj']) # verify imgsz are gs-multiples imgsz, imgsz_test = [check_img_size(x, gs) for x in [self.image_size, self.image_size]] # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm, false as default sync_bn = False if sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(self.device) logger.info('Using SyncBatchNorm()') # EMA ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[local_rank], output_device=local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, False, hyp=hyp_dict, rank=rank, prefix=colorstr('train: '), workers=0) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches overall_progress.value = nb * epochs assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (mlc, nc, data, nc - 1) if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader(test_path, imgsz_test, batch_size * 2, gs, False, hyp=hyp_dict, rect=True, rank=-1, pad=0.5, prefix=colorstr('val: '))[0] labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes if plots: plot_labels(labels, save_dir) # Anchors check_anchors(dataset, model=model, thr=hyp_dict['anchor_t'], imgsz=imgsz) model.half().float() # Model parameters hyp_dict['box'] *= 3. / nl # scale to layers hyp_dict['cls'] *= nc / 80. * 3. / nl # scale to classes and layers hyp_dict['obj'] *= (imgsz / 640) ** 2 * 3. 
/ nl # scale to image size and layers model.nc = nc # attach number of classes to model model.hyp = hyp_dict # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to(self.device) * nc # attach class weights model.names = names # Start training time_start.value = time.time() # number of warmup iterations, max(3 epochs, 1k iterations) nw = max(round(hyp_dict['warmup_epochs'] * nb), 1000) maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) compute_loss = ComputeLoss(model) # init loss class logger.info(f'Image sizes {imgsz} train, {imgsz_test} test\n' f'Using {dataloader.num_workers} dataloader workers\n' f'Logging results to {save_dir}\n' f'Starting training for {epochs} epochs...') for epoch in range(start_epoch, epochs): # epoch ----------------------- model.train() # Update image weights (optional) image_weights = False if image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() mloss = torch.zeros(4, device=self.device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info('%10s' * 8 % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, (imgs, targets, paths, _) in pbar: # batch --------------------------- ni = i + nb * epoch # number integrated batches (since train start) current_progress.value = ni imgs = imgs.to(self.device, non_blocking=True).float() / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max(1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [hyp_dict['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [hyp_dict['warmup_momentum'], hyp_dict['momentum']]) # Multi-scale multi_scale = False if multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss(pred, targets.to(self.device)) # loss scaled by batch_size if rank != -1: # gradient averaged between devices in DDP mode loss *= int(os.environ['WORLD_SIZE']) if 'WORLD_SIZE' in os.environ else 1 # Backward scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() 
optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ( '%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) logger.info(s) # Plot if plots and ni < 3: f = save_dir / f'train_batch{ni}.jpg' # filename Thread(target=plot_images, args=(imgs, targets, paths, f), daemon=True).start() # end batch ----------------------------------------------------- # end epoch --------------------------------------------------------- # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride', 'class_weights']) final_epoch = epoch + 1 == epochs if final_epoch or True: # Calculate mAP results, maps, times = test.test(data, batch_size=batch_size * 2, imgsz=imgsz_test, model=ema.ema, single_cls=False, dataloader=testloader, save_dir=save_dir, verbose=nc < 50 and final_epoch, plots=plots and final_epoch, log_imgs=0, compute_loss=compute_loss) # Write with open(results_file, 'w+') as f: f.write(s + '%10.4g' * 7 % results + '\n') # P, R, [email protected], [email protected], val_loss(box, obj, cls) bucket = '' if bucket: os.system('gsutil cp %s gs://results/results.txt' % results_file) # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = True if save: with open(results_file, 'r') as f: # create checkpoint checkpoint = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict(), 'wandb_id': None } # Save last, best and delete if current_progress.value % 100: torch.save(checkpoint, last) if best_fitness == fi: torch.save(checkpoint, best) del checkpoint # end epoch --------------------------------------------------------- # end training time_end.value = time.time() - time_start.value self.load_network()
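# The optional image-weights branch above relies on rank 0 drawing the weighted sample of
# dataset indices and broadcasting it, so every DDP worker sees the same epoch ordering.
# A minimal sketch under that assumption (dataset_size and the sampling call are illustrative):
import torch
import torch.distributed as dist

def broadcast_epoch_indices(sampled_indices, dataset_size, device):
    # sampled_indices is only meaningful on rank 0; other ranks receive into a zero buffer
    if dist.get_rank() == 0:
        indices = torch.tensor(sampled_indices, dtype=torch.int32, device=device)
    else:
        indices = torch.zeros(dataset_size, dtype=torch.int32, device=device)
    dist.broadcast(indices, 0)
    return indices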
def broadcast_module_itr(args, module: torch.nn.Module, source=0):
    group = dist.new_group(list(range(args.num_subnet)))
    for para in module.parameters():
        dist.broadcast(para.data, src=source, group=group, async_op=False)
    dist.destroy_process_group(group)
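# broadcast_module_itr above builds and destroys a process group on every call; if it is invoked
# repeatedly, a variant that reuses one long-lived group avoids that setup cost. A hedged sketch
# (the caller is assumed to create the group once with dist.new_group and pass it in):
import torch
import torch.distributed as dist

def broadcast_module_params(module, source=0, group=None):
    for para in module.parameters():
        dist.broadcast(para.data, src=source, group=group)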
def sync_parameters(self):
    for param in self.module.parameters():
        dist.broadcast(param.data, 0)
def cluster_memory(model, local_memory_index, local_memory_embeddings, size_dataset, nmb_kmeans_iters=10): j = 0 assignments = -100 * torch.ones(len(args.nmb_prototypes), size_dataset).long() with torch.no_grad(): for i_K, K in enumerate(args.nmb_prototypes): # run distributed k-means # init centroids with elements from memory bank of rank 0 centroids = torch.empty(K, args.feat_dim).cuda(non_blocking=True) if args.rank == 0: random_idx = torch.randperm(len( local_memory_embeddings[j]))[:K] assert len( random_idx) >= K, "please reduce the number of centroids" centroids = local_memory_embeddings[j][random_idx] dist.broadcast(centroids, 0) for n_iter in range(nmb_kmeans_iters + 1): # E step dot_products = torch.mm(local_memory_embeddings[j], centroids.t()) _, local_assignments = dot_products.max(dim=1) # finish if n_iter == nmb_kmeans_iters: break # M step where_helper = get_indices_sparse( local_assignments.cpu().numpy()) counts = torch.zeros(K).cuda(non_blocking=True).int() emb_sums = torch.zeros(K, args.feat_dim).cuda(non_blocking=True) for k in range(len(where_helper)): if len(where_helper[k][0]) > 0: emb_sums[k] = torch.sum( local_memory_embeddings[j][where_helper[k][0]], dim=0, ) counts[k] = len(where_helper[k][0]) dist.all_reduce(counts) mask = counts > 0 dist.all_reduce(emb_sums) centroids[mask] = emb_sums[mask] / counts[mask].unsqueeze(1) # normalize centroids centroids = nn.functional.normalize(centroids, dim=1, p=2) getattr(model.module.prototypes, "prototypes" + str(i_K)).weight.copy_(centroids) # gather the assignments assignments_all = torch.empty(args.world_size, local_assignments.size(0), dtype=local_assignments.dtype, device=local_assignments.device) assignments_all = list(assignments_all.unbind(0)) dist_process = dist.all_gather(assignments_all, local_assignments, async_op=True) dist_process.wait() assignments_all = torch.cat(assignments_all).cpu() # gather the indexes indexes_all = torch.empty(args.world_size, local_memory_index.size(0), dtype=local_memory_index.dtype, device=local_memory_index.device) indexes_all = list(indexes_all.unbind(0)) dist_process = dist.all_gather(indexes_all, local_memory_index, async_op=True) dist_process.wait() indexes_all = torch.cat(indexes_all).cpu() # log assignments assignments[i_K][indexes_all] = assignments_all # next memory bank to use j = (j + 1) % len(args.crops_for_assign) return assignments
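# cluster_memory above is a distributed k-means: rank 0 seeds the centroids from its memory bank
# and broadcasts them, then every iteration each rank all-reduces its per-cluster counts and
# embedding sums so all workers apply the identical M-step. A condensed, hedged sketch of a
# single iteration (embeddings is this rank's local slice of shape [n_local, feat_dim]):
import torch
import torch.distributed as dist
from torch.nn import functional as F

def kmeans_step(embeddings, centroids):
    k, dim = centroids.shape
    assignments = torch.mm(embeddings, centroids.t()).argmax(dim=1)  # E step (local)
    counts = torch.zeros(k, device=embeddings.device)
    sums = torch.zeros(k, dim, device=embeddings.device)
    counts.scatter_add_(0, assignments, torch.ones_like(assignments, dtype=counts.dtype))
    sums.index_add_(0, assignments, embeddings)
    dist.all_reduce(counts)  # M step (global)
    dist.all_reduce(sums)
    mask = counts > 0
    new_centroids = centroids.clone()
    new_centroids[mask] = sums[mask] / counts[mask].unsqueeze(1)
    return F.normalize(new_centroids, dim=1, p=2)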
def __init__(self, module, device_ids=None, output_device=None, dim=0): super(DistributedDataParallel, self).__init__() if device_ids is None: device_ids = list(range(torch.cuda.device_count())) if output_device is None: output_device = device_ids[0] self.dim = dim self.module = module self.device_ids = device_ids self.output_device = output_device # Sync params and buffers for p in self.module.state_dict().values(): dist.broadcast(p, 0) if len(device_ids) > 1: # TODO: we don't need to replicate params in here. they're always going to # be broadcasted using larger blocks in broadcast_coalesced, so it might be # better to not pollute the caches with these small blocks self._module_copies = replicate(self.module, self.device_ids) self._module_copies[0] = self.module for module_copy in self._module_copies[1:]: for param, copy_param in zip(self.module.parameters(), module_copy.parameters()): copy_param.detach_() copy_param.requires_grad = param.requires_grad else: self._module_copies = [self.module] # Split parameters into buckets that will coalesce reductions # TODO: different types need different buckets t = None for p in self.module.parameters(): tp = type(p.data) if t is not None and t is not tp: raise ValueError("DistributedDataParallel requires all parameters' data to be of the same type") t = tp self.bucket_sizes = [] self.bucket_map = {} MB = 1024 * 1024 self.broadcast_bucket_size = 10 * MB # used for param sync before forward bucket_bytes_cap = 1 * MB bucket_bytes = bucket_bytes_cap # to init the first bucket immediately for param_tuple in zip(*map(lambda m: m.parameters(), self._module_copies)): if bucket_bytes >= bucket_bytes_cap: self.bucket_sizes.append(0) bucket_bytes = 0 self.bucket_sizes[-1] += 1 for p in param_tuple: self.bucket_map[p] = len(self.bucket_sizes) - 1 bucket_bytes += p.numel() * p.element_size() self.buckets = [[[] for _ in range(len(self.device_ids))] for _ in range(len(self.bucket_sizes))] self.bucket_events = [[None] * len(self.device_ids) for _ in range(len(self.bucket_sizes))] self.reduced = [False] * len(self.bucket_sizes) self._register_grad_hooks() self.dispatch_lock = threading.Lock() self._start_reduction_threads()
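# The constructor above assigns parameters to roughly 1 MB buckets so that gradient reductions
# can later be coalesced. A minimal sketch of just that bucketing step, assuming parameters are
# visited in the same fixed order on every rank:
def assign_buckets(parameters, bucket_bytes_cap=1024 * 1024):
    bucket_sizes, bucket_map = [], {}
    bucket_bytes = bucket_bytes_cap  # forces a new bucket for the first parameter
    for p in parameters:
        if bucket_bytes >= bucket_bytes_cap:
            bucket_sizes.append(0)
            bucket_bytes = 0
        bucket_sizes[-1] += 1
        bucket_map[p] = len(bucket_sizes) - 1
        bucket_bytes += p.numel() * p.element_size()
    return bucket_sizes, bucket_map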
def run(args): device = torch.device( 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu') torch.manual_seed(1234) logging.info(f"{args.rank}-th worker starts.") read_start = time.time() f_id_start = args.rank * args.num_files f_id_end = f_id_start + args.num_files f_path_list = [ "{}/{}".format(args.root, i) for i in range(f_id_start, f_id_end) ] f = open(f_path_list[0]).readlines() dataset = DenseLibsvmDataset(f, args.features, args.pos_tag) if len(f_path_list) > 1: for file_name in f_path_list[1:]: f = open(file_name).readlines() dataset.add_more(f) total_count = dataset.__len__() pos_count = 0 for i in range(total_count): if dataset.__getitem__(i)[1] == 1: pos_count += 1 print("{} positive observations out of {}".format(pos_count, total_count)) train_set = np.array(dataset.ins_list) dt = train_set.dtype centroid_shape = (args.num_clusters, train_set.shape[1]) logging.info(f"Loading dataset costs {time.time() - read_start}s") logging.info(f"centorid shape: {centroid_shape}") # initialize centroids init_cent_start = time.time() if args.rank == 0: centroids = torch.tensor(train_set[0:args.num_clusters]) else: centroids = torch.empty(args.num_clusters, args.features) if dist_is_initialized(): dist.broadcast(centroids, 0) logging.info( f"Receiving initial centroids costs {time.time() - init_cent_start}s") training_start = time.time() avg_error = np.iinfo(np.int16).max for epoch in range(args.epochs): if avg_error >= args.threshold: start_compute = time.time() model = Kmeans(train_set, centroids, avg_error, centroid_type='tensor') model.find_nearest_cluster() end_compute = time.time() #logging.info(f"{args.rank}-th worker computing centroids takes {end_compute - start_compute}s") sync_start = time.time() if dist_is_initialized(): centroids, avg_error = broadcast_average( args, model.get_centroids("dense_tensor"), torch.tensor(model.error)) logging.info(f"{args.rank}-th worker finished {epoch} epoch. " f"Computing takes {end_compute - start_compute}s." f"Communicating takes {time.time() - sync_start}s. " #f"Centroids: {model.get_centroids('dense_tensor')}. " f"Loss: {model.error}") else: logging.info( f"{args.rank}-th worker finished training. Error = {avg_error}, centroids = {centroids}" ) logging.info( f"Whole process time : {time.time() - training_start}") return
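# run() above delegates per-epoch synchronization to a broadcast_average helper that is not shown
# in this excerpt. A plausible, hedged reconstruction (not the original implementation) is an
# all-reduce followed by division by the world size for both the local centroids and the local
# error, so every worker continues the next epoch from the same state:
import torch
import torch.distributed as dist

def broadcast_average(args, centroids, error):
    # args is kept only to mirror the call signature used in run() above
    world_size = dist.get_world_size()
    dist.all_reduce(centroids)  # defaults to SUM
    dist.all_reduce(error)
    return centroids / world_size, float(error) / world_size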