def worker(args):
    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)

    opt = SGD(
        params_with_grad,
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            params_with_grad,
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(params_with_grad)

    if args.weight_file is not None:
        # model.backbone.bottom_up.load_state_dict(weights, strict=False)
        logger.info("Loading Base-Pretrain weights...")
        weights = mge.load(args.weight_file)
        weight_new = {k: v for k, v in weights.items() if "pred_" not in k}
        model.load_state_dict(weight_new, strict=False)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(build_dataloader(args.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        if dist.get_rank() == 0:
            save_path = "logs/{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch
            )
            mge.save(
                {"epoch": epoch, "state_dict": model.state_dict()},
                save_path,
            )
            logger.info("dump weights to %s", save_path)
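# A hypothetical launch stub (not in the original): dist.launcher wraps the
# worker and spawns one process per GPU. parse_args and args.ngpus are
# stand-ins for the script's real CLI handling.
import megengine.distributed as dist

if __name__ == "__main__":
    args = parse_args()  # hypothetical parser supplying .file, .batch_size, ...
    if args.ngpus > 1:
        train_worker = dist.launcher(worker, n_gpus=args.ngpus)
        train_worker(args)
    else:
        worker(args)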
def __init__(self):
    super().__init__()
    self.classifier = None
    if dist.get_rank() == 0:
        # stage 0: stem + first residual stage
        self.features = M.Sequential(
            M.ConvBn2d(3, 64, 7, stride=2, padding=3, bias=False),
            M.MaxPool2d(kernel_size=3, stride=2, padding=1),
            BasicBlock(64, 64, 1),
            BasicBlock(64, 64, 1),
        )
    elif dist.get_rank() == 1:
        self.features = M.Sequential(
            BasicBlock(64, 128, 2),
            BasicBlock(128, 128, 1),
        )
    elif dist.get_rank() == 2:
        self.features = M.Sequential(
            BasicBlock(128, 256, 2),
            BasicBlock(256, 256, 1),
        )
    elif dist.get_rank() == 3:
        self.features = M.Sequential(
            BasicBlock(256, 512, 2),
            BasicBlock(512, 512, 1),
        )
        # only the last pipeline stage owns the classifier head
        self.classifier = M.Linear(512, 1000)
def worker(
    arch,
    model_file,
    data_root,
    ann_file,
):
    """
    :param arch: model architecture name registered in ``kpm``
    :param model_file: file of dumped weights
    :param data_root: the dataset directory
    :param ann_file: path to the annotation file
    """
    model = getattr(kpm, arch)()
    model.eval()

    weight = mge.load(model_file)
    weight = weight["state_dict"] if "state_dict" in weight.keys() else weight
    model.load_state_dict(weight)

    loader = build_dataloader(dist.get_rank(), dist.get_world_size(), data_root, ann_file)
    if dist.get_rank() == 0:
        loader = tqdm(loader)

    result_list = []
    for data_dict in loader:
        img, bbox, info = data_dict
        # horizontal flip for test-time augmentation; subtracting zeros
        # forces a contiguous copy of the reversed view
        flipped_img = img[:, :, :, ::-1] - np.zeros_like(img)
        data = np.concatenate([img, flipped_img], 0)
        data = np.ascontiguousarray(data).astype(np.float32)

        outs = model.predict(mge.tensor(data)).numpy()
        preds = outs[: img.shape[0]]
        preds_flipped = outs[img.shape[0]:, cfg.keypoint_flip_order, :, ::-1]
        # average the original and flipped heatmaps
        preds = (preds + preds_flipped) / 2

        for i in range(preds.shape[0]):
            results = find_keypoints(preds[i], bbox[i, 0])
            final_score = float(results[:, -1].mean() * info[-1][i])
            image_id = int(info[-2][i])
            keypoints = results.copy()
            keypoints[:, -1] = 1
            keypoints = keypoints.reshape(-1,).tolist()
            instance = {
                "image_id": image_id,
                "category_id": 1,
                "score": final_score,
                "keypoints": keypoints,
            }
            result_list.append(instance)
    return result_list
def forward(self, x):
    # ranks > 0 receive activations from the previous pipeline stage
    if dist.get_rank() > 0:
        x = recv_fr_prev_gpu()
    x = self.features(x)
    if dist.get_rank() != 3:
        # intermediate stages forward their activations to the next stage
        _ = send_to_next_gpu(x)
    else:
        # the last stage computes the classification output
        x = F.avg_pool2d(x, 7)
        x = F.flatten(x, 1)
        x = self.classifier(x)
    return x
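# send_to_next_gpu / recv_fr_prev_gpu are not shown in this file. A plausible
# sketch built from the same primitives as grad_fr_next_gpu further below;
# the key names and the shape/dtype handshake through the key-value client
# are assumptions, not the confirmed implementation.
import megengine.distributed as dist
import megengine.functional as F

def send_to_next_gpu(x):
    rank = dist.get_rank()
    # publish activation metadata so the next stage knows what to receive
    dist.get_client().user_set(f"shape_of_src{rank}", tuple(x.shape))
    dist.get_client().user_set(f"dtype_of_src{rank}", str(x.dtype))
    return F.distributed.remote_send(x, dest_rank=rank + 1)

def recv_fr_prev_gpu():
    rank = dist.get_rank()
    shape = dist.get_client().user_get(f"shape_of_src{rank - 1}")
    dtype = dist.get_client().user_get(f"dtype_of_src{rank - 1}")
    return F.distributed.remote_recv(src_rank=rank - 1, shape=shape, dtype=dtype)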
def train_and_evaluate(model, manager):
    rank = dist.get_rank()

    # reload weights from restore_file if specified
    if args.restore_file is not None:
        manager.load_checkpoints()

    world_size = dist.get_world_size()
    if world_size > 1:
        dist.bcast_list_(model.parameters())
        dist.bcast_list_(model.buffers())

    gm = GradManager().attach(
        model.parameters(),
        callbacks=dist.make_allreduce_cb("SUM") if world_size > 1 else None,
    )

    for epoch in range(manager.params.num_epochs):
        # train for one full pass over the training set
        train(model, manager, gm)

        # evaluate for one epoch on the validation set
        evaluate(model, manager)

        # save the best model weights according to params.major_metric
        if rank == 0:
            manager.check_best_save_last_checkpoints(latest_freq=5)
def worker(current_network, weight_file, dataset_dir):
    cfg = current_network.Cfg()
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)
    model.eval()

    state_dict = mge.load(weight_file)
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)

    def pred_func(data):
        pred = model(data)
        return pred

    test_loader = build_dataloader(dataset_dir, model.cfg)
    if dist.get_rank() == 0:
        test_loader = tqdm(test_loader)

    result_list = []
    for data in test_loader:
        img = data[0].squeeze()
        label = data[1].squeeze()
        im_info = data[2]
        pred = evaluate(pred_func, img, model.cfg)
        result = {"pred": pred, "gt": label, "name": im_info[2]}
        result_list.append(result)
    return result_list
def worker(current_network, weight_file, dataset_dir):
    cfg = current_network.Cfg()
    cfg.backbone_pretrained = False
    model = current_network.Net(cfg)
    model.eval()

    state_dict = mge.load(weight_file)
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
    model.load_state_dict(state_dict)

    evaluator = DetEvaluator(model)

    test_loader = build_dataloader(dataset_dir, model.cfg)
    if dist.get_rank() == 0:
        test_loader = tqdm(test_loader)

    result_list = []
    for data in test_loader:
        image, im_info = DetEvaluator.process_inputs(
            data[0][0],
            model.cfg.test_image_short_size,
            model.cfg.test_image_max_size,
        )
        pred_res = evaluator.predict(
            image=mge.tensor(image), im_info=mge.tensor(im_info)
        )
        result = {
            "pred_boxes": pred_res,
            "image_id": int(data[1][2][0].split(".")[0].split("_")[-1]),
        }
        result_list.append(result)
    return result_list
def infer(model, data_queue, args):
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")

    t = time.time()
    for step, (image, label) in enumerate(data_queue):
        n = image.shape[0]
        image = mge.tensor(image, dtype="float32")
        label = mge.tensor(label, dtype="int32")

        loss, acc1, acc5 = model(image, label)

        objs.update(loss.item(), n)
        top1.update(100 * acc1.item(), n)
        top5.update(100 * acc5.item(), n)
        total_time.update(time.time() - t)
        t = time.time()

        if step % args.report_freq == 0 and dist.get_rank() == 0:
            logger.info("Step %d, %s %s %s %s", step, objs, top1, top5, total_time)

    return objs.avg, top1.avg, top5.avg
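# AverageMeter is not defined in this file; a minimal sketch matching the
# interface the loops above and below rely on (update(val, n), .avg, __str__).
# The project's real implementation may track more state.
class AverageMeter:
    def __init__(self, name):
        self.name = name
        self.sum = 0.0
        self.cnt = 0

    def update(self, val, n=1):
        # accumulate a running average weighted by batch size
        self.sum += val * n
        self.cnt += n

    @property
    def avg(self):
        return self.sum / max(self.cnt, 1)

    def __str__(self):
        return "{} {:.4f}".format(self.name, self.avg)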
def valid(func, data_queue, args):
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    clck = AverageMeter("Time")

    t = time.time()
    for step, (image, label) in enumerate(data_queue):
        image = megengine.tensor(image, dtype="float32")
        label = megengine.tensor(label, dtype="int32")
        n = image.shape[0]

        loss, acc1, acc5 = func(image, label)

        objs.update(loss.item(), n)
        top1.update(100 * acc1.item(), n)
        top5.update(100 * acc5.item(), n)
        clck.update(time.time() - t, n)
        t = time.time()

        if step % args.print_freq == 0 and dist.get_rank() == 0:
            logging.info("Test step %d, %s %s %s %s", step, objs, top1, top5, clck)

    return objs.avg, top1.avg, top5.avg
def load_checkpoint(self, path2checkpoint, load_optim=True):
    """
    :param path2checkpoint: e.g. workdirs/xxxxx/checkpoint/epoch_10
    :return: dict
    """
    assert osp.exists(path2checkpoint), "{} does not exist".format(path2checkpoint)
    dirname = osp.split(path2checkpoint)[-1]
    epoch, nums = dirname.split("_")
    assert epoch in ("epoch",)
    self.logger.info("load checkpoint from {}".format(path2checkpoint))

    # iterate over every submodule that has an optimizer configured and load it
    res = dict()
    res["nums"] = int(nums)
    for submodule_name in self.optimizers_cfg.keys():
        submodule = getattr(self.model, submodule_name, None)
        assert submodule is not None, "model should have submodule {}".format(
            submodule_name
        )
        assert isinstance(
            submodule, Module
        ), "submodule should be an instance of mge.module.Module"
        if dist.get_rank() == 0:
            module_state_dict = mge.load(
                osp.join(path2checkpoint, submodule_name + module_ckpt_suffix)
            )
            submodule.load_state_dict(module_state_dict, strict=False)
        if load_optim:
            optim_state_dict = mge.load(
                osp.join(path2checkpoint, submodule_name + optim_ckpt_suffix)
            )
            res[submodule_name] = optim_state_dict
    return res
def worker(max_err):
    net = MnistNet(has_bn=True)
    net.load_state_dict(checkpoint["net_init"])
    lr = checkpoint["sgd_lr"]
    opt = SGD(net.parameters(), lr=lr)
    gm = ad.GradManager().attach(
        net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)]
    )

    # use the same data and label on every GPU so the result
    # does not depend on the number of GPUs
    data_train = Tensor(data)
    label_train = Tensor(label)
    loss = train(data_train, label_train, net, opt, gm)
    np.testing.assert_allclose(loss.numpy(), checkpoint["loss"], atol=max_err)

    # only rank 0 checks the updated parameters
    if dist.get_rank():
        return
    for param, param_ref in zip(
        net.state_dict().items(), checkpoint["net_updated"].items()
    ):
        assert param[0] == param_ref[0]
        if "bn" in param[0]:
            ref = param_ref[1].reshape(param[1].shape)
            np.testing.assert_allclose(param[1], ref, atol=max_err)
        else:
            np.testing.assert_allclose(param[1], param_ref[1], atol=max_err)
def worker():
    rank = dist.get_rank()
    size = dist.get_world_size()
    x = mge.tensor(np.random.randn(1, rank * 2 + 2), dtype=np.float32)
    m = M.Linear(rank * 2 + 2, rank * 2 + 4)
    gm = GradManager().attach(m.parameters())
    opt = optim.SGD(m.parameters(), 1e-3, momentum=0.9)

    def train_func(x):
        with gm:
            if rank != 0:
                x = dist.functional.remote_recv(
                    rank - 1, shape=(1, rank * 2 + 2), dtype=np.float32
                )
            y = m(x)
            if rank != size - 1:
                dist.functional.remote_send(y, dest_rank=rank + 1)
                gm.backward()
            else:
                y = y.mean()
                gm.backward(y)
        opt.step().clear_grad()

    train_funcs = [
        train_func,
        trace(symbolic=False)(train_func),
        trace(symbolic=True)(train_func),
    ]
    for func in train_funcs:
        for i in range(3):
            func(x)
def infer(model, data_queue, args):
    objs = AverageMeter("Loss")
    top1 = AverageMeter("Acc@1")
    top5 = AverageMeter("Acc@5")
    total_time = AverageMeter("Time")

    t = time.time()
    for step, (image, label) in enumerate(data_queue):
        n = image.shape[0]
        image = image.astype("float32")  # convert np.uint8 to float32
        label = label.astype("int32")

        loss, acc1, acc5 = model(image, label)

        objs.update(loss.numpy()[0], n)
        top1.update(100 * acc1.numpy()[0], n)
        top5.update(100 * acc5.numpy()[0], n)
        total_time.update(time.time() - t)
        t = time.time()

        if step % args.report_freq == 0 and dist.get_rank() == 0:
            logger.info(
                "Step %d, %s %s %s %s",
                step,
                objs,
                top1,
                top5,
                total_time,
            )

    return objs.avg, top1.avg, top5.avg
def grad_fr_next_gpu():
    # the next pipeline stage publishes the gradient's shape and dtype through
    # the distributed key-value client before sending the tensor itself
    shape = dist.get_client().user_get(f"grad_shape_of_src{dist.get_rank() + 1}")
    dtype = dist.get_client().user_get(f"grad_dtype_of_src{dist.get_rank() + 1}")
    return F.distributed.remote_recv(
        src_rank=dist.get_rank() + 1, shape=shape, dtype=dtype
    )
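# Hypothetical counterpart to grad_fr_next_gpu, not part of the original:
# before sending the gradient back to the previous stage, publish its
# metadata under the keys the receiver polls. The actual helper in the
# source tree may differ.
def grad_to_prev_gpu(grad):
    rank = dist.get_rank()
    dist.get_client().user_set(f"grad_shape_of_src{rank}", tuple(grad.shape))
    dist.get_client().user_set(f"grad_dtype_of_src{rank}", str(grad.dtype))
    return F.distributed.remote_send(grad, dest_rank=rank - 1)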
def worker(data, expect):
    rank = dist.get_rank()
    inp = tensor(data[rank])
    output = gather(inp)
    if rank == 0:
        assert np.allclose(output.numpy(), expect[rank])
    else:
        assert output is None
def func():
    with GradManager().attach(m.parameters()) as gm:
        if dist.get_rank() == 0:
            y = m(x)
        else:
            y = x
        y = F.distributed.broadcast(y)
        gm.backward(y)
def worker():
    rank = dist.get_rank()
    if rank in ranks:
        group = dist.new_group(ranks)
        assert group.size == 2
        assert group.key == "2,0"
        assert group.rank == ranks.index(rank)
        assert group.comp_node == "gpu{}:2".format(rank)
def worker(data, expect):
    rank = dist.get_rank()
    inp = tensor(data[rank])
    output = reduce_sum(inp)
    if rank == 0:
        assert np.allclose(output.numpy(), expect[rank])
    else:
        assert np.allclose(output.numpy(), 0)
def worker():
    rank = dist.get_rank()
    m = SyncExponentialMovingAverageObserver(momentum=t)
    y1 = mge.tensor(x1[rank * 3 : (rank + 1) * 3])
    y2 = mge.tensor(x2[rank * 3 : (rank + 1) * 3])
    m(y1)
    m(y2)
    np.testing.assert_allclose(m.min_val.numpy(), expected_min, atol=1e-6)
    np.testing.assert_allclose(m.max_val.numpy(), expected_max, atol=1e-6)
def func():
    with GradManager().attach(m.parameters()) as gm:
        y = m(x)
        y = F.distributed.reduce_sum(y)
        if dist.get_rank() == 0:
            loss = (2 * y + 1).mean()
            gm.backward(loss)
        else:
            gm.backward()
def worker(data, yv_expect, running_mean, running_var):
    rank = dist.get_rank()
    bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
    for i in range(steps):
        yv = bn(Tensor(data[rank][i]))
    _assert_allclose(yv.numpy(), yv_expect[rank])
    _assert_allclose(bn.running_mean.numpy(), running_mean)
    _assert_allclose(bn.running_var.numpy(), running_var)
def worker():
    rank = dist.get_rank()
    if rank in ranks:
        group = dist.new_group(ranks)
        assert group.size == 2
        assert group.key == "2,0"
        assert group.rank == ranks.index(rank)
        dt = get_default_device()[:-1]
        assert group.comp_node == "{}{}:2".format(dt, rank)
def worker(world_size, args):  # pylint: disable=too-many-statements
    rank = dist.get_rank()
    if world_size > 1:
        # initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))

    model = models.__dict__[args.arch]()
    if args.mode != "normal":
        quantize_qat(model, qconfig=Q.ema_fakequant_qconfig)

    if args.checkpoint:
        logger.info("Load pretrained weights from %s", args.checkpoint)
        ckpt = mge.load(args.checkpoint)
        ckpt = ckpt["state_dict"] if "state_dict" in ckpt else ckpt
        model.load_state_dict(ckpt, strict=False)

    if args.mode == "quantized":
        quantize(model)

    # define the valid graph
    def valid_func(image, label):
        model.eval()
        logits = model(image)
        loss = F.loss.cross_entropy(logits, label, label_smooth=0.1)
        acc1, acc5 = F.topk_accuracy(logits, label, (1, 5))
        if dist.is_distributed():  # all_reduce_mean
            loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
            acc1 = dist.functional.all_reduce_sum(acc1) / dist.get_world_size()
            acc5 = dist.functional.all_reduce_sum(acc5) / dist.get_world_size()
        return loss, acc1, acc5

    # build the valid dataset
    logger.info("preparing dataset...")
    valid_dataset = data.dataset.ImageNet(args.data, train=False)
    valid_sampler = data.SequentialSampler(valid_dataset, batch_size=100, drop_last=False)
    valid_queue = data.DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        transform=T.Compose([
            T.Resize(256),
            T.CenterCrop(224),
            T.Normalize(mean=128),
            T.ToMode("CHW"),
        ]),
        num_workers=args.workers,
    )

    _, valid_acc, valid_acc5 = infer(valid_func, valid_queue, args)
    if rank == 0:
        logger.info("TEST %f, %f", valid_acc, valid_acc5)
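# The all_reduce_sum / world_size pattern in valid_func is the usual
# all-reduce-mean idiom; a hypothetical helper (not in the original source)
# that factors it out:
import megengine.distributed as dist

def all_reduce_mean(x):
    # average a metric tensor across all ranks; no-op in single-GPU runs
    if dist.is_distributed():
        x = dist.functional.all_reduce_sum(x) / dist.get_world_size()
    return x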
def worker(rank, backend, q):
    if not mge.is_cuda_available():
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, q)
    assert dist.is_distributed()
    assert dist.get_master_ip() == _LOCALHOST
    assert dist.get_master_port() > 0
    assert dist.get_world_size() == world_size
    assert dist.get_rank() == rank
    assert dist.get_backend() == backend
def worker(val, shape):
    rank = dist.get_rank()
    if rank == 0:  # remote send
        x = tensor(val, device="xpu0")
        remote_send(x, 1)
        sync()
    else:  # remote recv
        y = remote_recv(0)
        assert y.device == get_default_device()
        np.testing.assert_almost_equal(val, y.numpy())
def worker(val, shape):
    rank = dist.get_rank()
    if rank == 0:  # remote send
        x = tensor(val, device="gpu0")
        remote_send(x, 1)
        sync()
    else:  # remote recv
        y = remote_recv(0, shape, np.float32)
        assert y.device == "gpu1"
        np.testing.assert_almost_equal(val, y.numpy())
def main(params):
    mge.dtr.eviction_threshold = "5GB"
    mge.dtr.enable()

    rank = dist.get_rank()

    # set the logger
    logger = utils.set_logger(os.path.join(params.model_dir, "train.log"))

    # set the tensorboard writer
    log_dir = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tb_dir = os.path.join(params.model_dir, "summary")
    os.makedirs(tb_dir, exist_ok=True)
    writer = SummaryWriter(log_dir=tb_dir)

    # create the input data pipeline
    if rank == 0:
        logger.info("Loading the datasets from {}".format(params.data_dir))

    # fetch dataloaders
    dataloaders = data_loader.fetch_dataloader(params)

    # define the model and optimizer
    model = net.fetch_net(params)

    # apply weight decay to weights only, not to biases
    param_groups = [
        {"params": utils.bias_parameters(model)},
        {"params": utils.weight_parameters(model), "weight_decay": 1e-6},
    ]
    optimizer = Adam(param_groups, lr=params.learning_rate, eps=1e-7)
    milestones = [50, 150, 250, 350, 450]
    scheduler = MultiStepLR(optimizer, milestones, 0.5)

    # initial status for the checkpoint manager
    manager = Manager(
        model=model,
        optimizer=optimizer,
        scheduler=scheduler,
        params=params,
        dataloaders=dataloaders,
        writer=writer,
        logger=logger,
    )

    # train the model
    if rank == 0:
        logger.info("Starting training for {} epoch(s)".format(params.num_epochs))
    train_and_evaluate(model, manager)
def worker(data, expect):
    rank = dist.get_rank()
    inp = tensor(data[rank])

    def func():
        output = scatter(inp, axis=axis)
        return output

    func = trace(symbolic=symbolic)(func)
    output = func()
    assert np.allclose(output.numpy(), expect[rank])
def __init__(
    self,
    dataset,
    batch_size=1,
    drop_last=False,
    num_samples=None,
    world_size=None,
    rank=None,
    seed=None,
):
    if (
        not isinstance(batch_size, int)
        or isinstance(batch_size, bool)
        or batch_size <= 0
    ):
        raise ValueError(
            "batch_size should be a positive integer value, "
            "but got batch_size={}".format(batch_size)
        )
    if not isinstance(drop_last, bool):
        raise ValueError(
            "drop_last should be a boolean value, but got "
            "drop_last={}".format(drop_last)
        )
    if num_samples is not None and (
        not isinstance(num_samples, int)
        or isinstance(num_samples, bool)
        or num_samples <= 0
    ):
        raise ValueError(
            "num_samples should be a positive integer "
            "value, but got num_samples={}".format(num_samples)
        )

    self.batch_size = batch_size
    self.dataset = dataset
    self.drop_last = drop_last

    if world_size is None:
        world_size = dist.get_world_size() if dist.is_distributed() else 1
    self.world_size = world_size
    if rank is None:
        rank = dist.get_rank() if dist.is_distributed() else 0
    self.rank = rank

    if num_samples is None:
        num_samples = len(self.dataset)
    # each rank draws ceil(num_samples / world_size) samples
    self.num_samples = int(math.ceil(num_samples / self.world_size))

    # make sure seeds are the same at each rank
    if seed is None and self.world_size > 1:
        seed = 0
    self.rng = np.random.RandomState(seed)
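# Worked example of the ceil-split above (illustrative numbers, not from the
# original): 10 samples across 4 ranks gives each rank 3 samples, which
# implies a few indices are repeated so every rank draws the same count.
import math
per_rank = int(math.ceil(10 / 4))
assert per_rank == 3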
def train_one_epoch():
    def train_func(images, labels):
        opt.clear_grad()
        with gm:
            loss, accuracy, _ = model(images, labels)
            gm.backward(loss)
            if dist.is_distributed():  # all_reduce_mean
                loss = dist.functional.all_reduce_sum(loss) / dist.get_world_size()
                accuracy = dist.functional.all_reduce_sum(accuracy) / dist.get_world_size()
        opt.step()
        return loss, accuracy

    model.train()

    average_loss = AverageMeter("loss")
    average_accuracy = AverageMeter("accuracy")
    data_time = AverageMeter("data_time")
    train_time = AverageMeter("train_time")

    total_step = len(train_queue)
    data_iter = iter(train_queue)
    for step in range(total_step):
        # get the next batch of data
        data_tic = time.time()
        images, labels = next(data_iter)
        data_toc = time.time()

        # forward pass & backward pass
        train_tic = time.time()
        images = mge.tensor(images, dtype="float32")
        labels = mge.tensor(labels, dtype="int32")
        loss, accuracy = train_func(images, labels)
        train_toc = time.time()

        # do the statistics and logging
        n = images.shape[0]
        average_loss.update(loss.item(), n)
        average_accuracy.update(accuracy.item() * 100, n)
        data_time.update(data_toc - data_tic)
        train_time.update(train_toc - train_tic)

        if step % configs["log_interval"] == 0 and dist.get_rank() == 0:
            logger.info(
                "epoch: %d, step: %d, %s, %s, %s, %s",
                epoch,
                step,
                average_loss,
                average_accuracy,
                data_time,
                train_time,
            )