def worker(args):
    """Per-process training entry point.

    Loads the network definition from ``args.file``, builds the optimizer and
    GradManager (with gradient all-reduce when running distributed), optionally
    warm-starts from a pretrain weight file, then runs the epoch loop and
    checkpoints on rank 0.

    NOTE(review): assumes the distributed process group is already initialized
    by the caller when world_size > 1 — confirm at the call site.
    """
    current_network = import_from_file(args.file)
    model = current_network.Net(current_network.Cfg())
    model.train()
    # Only rank 0 logs config / model structure to avoid duplicated output.
    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))
    # Collect trainable parameters, skipping frozen backbone stages.
    # backbone_freeze_at >= 1 freezes conv1; >= 2 additionally freezes layer1.
    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)
    opt = SGD(
        params_with_grad,
        # Linear LR scaling with per-process batch size.
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        # Scaled by world size — presumably to offset the SUM all-reduce of
        # gradients below; verify against the training recipe.
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )
    gm = GradManager()
    if dist.get_world_size() > 1:
        # SUM-all-reduce gradients across ranks during backward.
        gm.attach(params_with_grad, callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(params_with_grad)
    if args.weight_file is not None:
        logger.info("Loading Base-Pretrain weights...")
        weights = mge.load(args.weight_file)
        # Drop prediction-head weights ('pred_' keys) so only shared layers
        # are warm-started; strict=False tolerates the missing keys.
        weight_new = {k: v for k, v in weights.items() if 'pred_' not in k}
        model.load_state_dict(weight_new, strict=False)
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters
    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(args.batch_size, args.dataset_dir, model.cfg))
    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        # Checkpoint once per epoch, rank 0 only.
        if dist.get_rank() == 0:
            save_path = "logs/{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch)
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict()
                },
                save_path,
            )
            logger.info("dump weights to %s", save_path)
def test_regression_1762():
    """Regression test for issue 1762: backward through a tensor that also
    feeds a batch-norm op placed after it must not error."""
    inp = F.ones((10, 10, 3, 3))
    conv = M.Conv2d(10, 10, kernel_size=3, padding=1)
    affine_shape = (1, 10, 1, 1)
    weight = mge.Parameter(np.ones(affine_shape, dtype=np.float32))
    bias = mge.Parameter(np.zeros(affine_shape, dtype=np.float32))
    gm = GradManager()
    gm.attach(list(conv.parameters()) + [weight, bias])
    with gm:
        conv_out = conv(inp)
        bn_out = F.batch_norm(
            conv_out, None, None, weight, bias, training=True,
        )
        # Weird error only occur when this action is placed after BN
        # Op type is not relevant
        loss = conv_out + 1
        gm.backward(loss)
def test_attach_in_with_block():
    """Attaching a tensor while already inside the recording block works,
    and it receives a gradient of 1 w.r.t. itself downstream."""
    param = mge.Parameter([1.0])
    gm = GradManager()
    with gm:
        mid = param * 3
        gm.attach(mid)
        out = mid + 1
        gm.backward(out)
    assert int(mid.grad.numpy()) == 1
def test_empty_grad_in_backward():
    """F.where routes gradient only to the selected branch: here the
    condition is everywhere false, so x gets zero grad and y gets ones."""
    x = mge.Parameter(F.full(100, 0.5))
    y = mge.Parameter(F.ones(100))
    gm = GradManager()
    gm.attach([x, y])
    with gm:
        selected = F.where(x > 0.7, x, y)
        total = selected.sum()
        gm.backward(total)
    assert np.all(x.grad.numpy() == 0)
    assert np.all(y.grad.numpy() == 1)
def f():
    """GradScaler round-trip: backward with unscale_grad=False leaves grads
    multiplied by scale_factor; unscale() divides them back out."""
    gm = GradManager()
    scaler = GradScaler()
    src = mge.tensor(1.0)
    for _ in range(3):
        with gm:
            tracked = src + 1
            gm.attach(tracked)
            loss = tracked + 1
            scaler.backward(gm, loss, unscale_grad=False)
        np.testing.assert_equal(tracked.grad.numpy(), scaler.scale_factor)
        scaler.unscale(gm.attached_tensors())
        np.testing.assert_equal(tracked.grad.numpy(), 1)
    # test handle None elements
    scaler.unscale(gm.attached_tensors())
def test_no_dependency():
    """A parameter not on the path to the backward target gets no grad."""
    data = mge.tensor(3)
    used = mge.Parameter(1.0)
    unused = mge.Parameter(1.0)
    gm = GradManager()
    gm.attach(used)
    gm.attach(unused)
    with gm:
        first = data * used
        second = unused * first  # noqa: F841 — deliberately not backpropagated
        gm.backward(first.sum())
    assert used.grad is not None
    assert unused.grad is None
def test_attach_temporary():
    """A tensor attached with a callback is passed to that callback during
    backward, and releasing it drops the GradManager's reference too
    (verified via weakref)."""
    w = mge.Parameter(2.0)
    gm = GradManager()
    gm.attach(w)

    def hook(tensor, grad):
        # Closure reads `ref`, bound in the loop below before backward runs.
        assert tensor is ref()
        hook.called = True

    for step in range(3):
        with gm:
            hook.called = False
            temp = mge.Tensor(step, dtype="float32")
            gm.attach(temp, callbacks=hook)
            ref = weakref.ref(temp)
            prod = temp * w
            gm.backward(prod)
            assert hook.called
        del temp
        assert ref() is None
def test_attached_tensors():
    """attached_tensors() preserves attach order and de-duplicates
    re-attached tensors."""
    first = mge.Parameter(2.0)
    second = mge.Parameter(2.0)
    gm = GradManager()

    def verify(expected):
        actual = gm.attached_tensors()
        assert len(actual) == len(expected)
        assert all(exp is act for exp, act in zip(expected, actual))

    gm.attach(first)
    verify([first])
    gm.attach(second)
    verify([first, second])
    # Re-attaching must not create a duplicate entry.
    gm.attach(first)
    verify([first, second])
def worker(args):
    """Per-process training entry point with backbone/head LR split.

    Loads the network definition from ``args.file``, builds an SGD optimizer
    with a 10x-lower learning rate for backbone parameters, supports resuming
    from a checkpoint (model + optimizer state), then runs the epoch loop and
    checkpoints on rank 0.

    NOTE(review): assumes the distributed process group is already initialized
    by the caller when world_size > 1 — confirm at the call site.
    """
    current_network = import_from_file(args.file)
    model = current_network.Net(current_network.Cfg())
    model.train()
    # Only rank 0 logs config / model structure to avoid duplicated output.
    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))
    # Split parameters by name: anything containing "backbone" trains at a
    # reduced learning rate; everything else uses the base rate.
    backbone_params = []
    head_params = []
    for name, param in model.named_parameters():
        if "backbone" in name:
            backbone_params.append(param)
        else:
            head_params.append(param)
    opt = SGD(
        [
            {
                "params": backbone_params,
                "lr": model.cfg.learning_rate * 0.1  # backbone at 1/10 LR
            },
            {
                "params": head_params
            },
        ],
        lr=model.cfg.learning_rate,
        momentum=model.cfg.momentum,
        # Scaled by world size — presumably to offset the SUM all-reduce of
        # gradients below; verify against the training recipe.
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )
    gm = GradManager()
    if dist.get_world_size() > 1:
        # SUM-all-reduce gradients across ranks during backward.
        gm.attach(model.parameters(), callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(model.parameters())
    cur_epoch = 0
    if args.resume is not None:
        # Resume both model weights and optimizer state; continue from the
        # epoch after the one stored in the checkpoint.
        pretrained = mge.load(args.resume)
        cur_epoch = pretrained["epoch"] + 1
        model.load_state_dict(pretrained["state_dict"])
        opt.load_state_dict(pretrained["opt"])
        if dist.get_rank() == 0:
            logger.info("load success: epoch %d", cur_epoch)
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters
    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(model.cfg.batch_size, args.dataset_dir, model.cfg))
    for epoch in range(cur_epoch, model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch)
        # Checkpoint once per epoch, rank 0 only; includes optimizer state so
        # training can be resumed exactly.
        if dist.get_rank() == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch)
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict(),
                    "opt": opt.state_dict()
                },
                save_path)
            logger.info("dump weights to %s", save_path)
def worker(master_ip, port, rank, world_size, args):
    """Per-process training entry point for a keypoint model.

    Initializes the distributed process group when world_size > 1, builds the
    model/Adam optimizer/GradManager, assembles the COCO keypoint training
    pipeline (sampling, augmentation, heatmap collation), then runs the epoch
    loop, checkpointing periodically on rank 0.
    """
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,  # one GPU device per rank
        )
    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)
    # Look up the model constructor by architecture name on the kpm module.
    model = getattr(kpm, args.arch)()
    model.train()
    start_epoch = 0
    if args.resume is not None:
        # Resume model weights and the epoch counter from a checkpoint.
        file = mge.load(args.resume)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]
    optimizer = optim.Adam(
        model.parameters(), lr=cfg.initial_lr, weight_decay=cfg.weight_decay
    )
    gm = GradManager()
    if dist.get_world_size() > 1:
        # SUM-all-reduce gradients across ranks during backward.
        gm.attach(
            model.parameters(),
            callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters
    # Build train datasets
    logger.info("preparing dataset..")
    ann_file = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_train2017.json"
    )
    train_dataset = COCOJoints(
        cfg.data_root,
        ann_file,
        image_set="train2017",
        order=("image", "keypoints", "boxes", "info"),
    )
    logger.info("Num of Samples: {}".format(len(train_dataset)))
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=cfg.batch_size, drop_last=True
    )
    # Augmentation pipeline: normalize, flip, then optional half-body /
    # box-extension transforms driven by config flags, then affine + ToMode.
    transforms = [
        T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order)
    ]
    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(
                cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body
            )
        )
    if cfg.extend_boxes:
        # Aspect ratio derived from the configured input shape (w/h order —
        # presumably input_shape is (H, W); verify against cfg).
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0])
        )
    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]
    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(transforms=transforms, order=train_dataset.order,),
        # Collator renders keypoints into target heatmaps; multi-scale
        # supervision uses all configured kernels, otherwise only the last.
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thr,
            cfg.heat_kernels if args.multi_scale_supervision else cfg.heat_kernels[-1:],
            cfg.heat_range,
        ),
    )
    # Start training
    for epoch in range(start_epoch, cfg.epochs):
        loss = train(model, train_queue, optimizer, gm, epoch=epoch)
        logger.info("Epoch %d Train %.6f ", epoch, loss)
        if rank == 0 and epoch % cfg.save_freq == 0:
            # save checkpoint (epoch + 1 so a resume continues at the next epoch)
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
def worker(master_ip, port, world_size, rank, args):
    """Per-process training entry point (detection variant).

    Initializes the distributed process group when world_size > 1, loads the
    network definition from ``args.file``, builds the SGD optimizer and
    GradManager (skipping frozen backbone stages), optionally loads backbone
    pretrain weights, then runs the epoch loop, checkpointing on rank 0 into
    ``args.log_dir``.
    """
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,  # one GPU device per rank
        )
        logger.info("Init process group for gpu{} done".format(rank))
    current_network = import_from_file(args.file)
    model = current_network.Net(current_network.Cfg())
    model.train()
    # Only rank 0 logs config / model structure to avoid duplicated output.
    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))
    # Collect trainable parameters, skipping frozen backbone stages.
    # backbone_freeze_at >= 1 freezes conv1; >= 2 additionally freezes layer1.
    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)
    opt = SGD(
        params_with_grad,
        # Linear LR scaling with per-process batch size.
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        # Scaled by world size — presumably to offset the SUM all-reduce of
        # gradients below; verify against the training recipe.
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )
    gm = GradManager()
    if dist.get_world_size() > 1:
        # SUM-all-reduce gradients across ranks during backward.
        gm.attach(params_with_grad, callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(params_with_grad)
    if args.weight_file is not None:
        # Warm-start only the backbone from pretrain weights; strict=False
        # tolerates missing/extra keys.
        weights = mge.load(args.weight_file)
        model.backbone.bottom_up.load_state_dict(weights, strict=False)
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters
    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(args.batch_size, args.dataset_dir, model.cfg))
    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        # Checkpoint once per epoch, rank 0 only.
        if dist.get_rank() == 0:
            save_path = os.path.join(args.log_dir, "epoch_{}.pkl".format(epoch))
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict()
                },
                save_path,
            )
            logger.info("dump weights to %s", save_path)
def test_attach_differentible_tensor_dtype(differentible_dtype):
    """Attaching a tensor of a differentiable dtype must succeed."""
    tensor = mge.tensor([1], dtype=differentible_dtype)
    manager = GradManager()
    manager.attach([tensor])
def test_attach_invalid_tensor_dtype(invalid_dtype):
    """Attaching a tensor of a non-differentiable dtype must raise."""
    tensor = mge.tensor([1], dtype=invalid_dtype)
    manager = GradManager()
    with pytest.raises(AssertionError):
        manager.attach([tensor])