Code example #1
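The snippets below are extracted from larger MegEngine projects and test suites, so they omit their imports. A minimal, assumed set of imports they collectively rely on (the aliases are inferred from the code, not taken from the original files; project-specific helpers such as logger, import_from_file, build_dataloader, and the dataset/transform classes come from each repository and are not shown) would be:

import os
import weakref

import numpy as np
import pytest

import megengine as mge
import megengine.data as data
import megengine.data.transform as T
import megengine.distributed as dist
import megengine.functional as F
import megengine.module as M
import megengine.optimizer as optim
from megengine.amp import GradScaler
from megengine.autodiff import GradManager
from megengine.optimizer import SGD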
def worker(args):
    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    # collect trainable parameters, skipping backbone stages frozen by backbone_freeze_at
    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)

    opt = SGD(
        params_with_grad,
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        # all-reduce (sum) gradients across workers during the backward pass
        gm.attach(params_with_grad,
                  callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(params_with_grad)

    if args.weight_file is not None:
        # model.backbone.bottom_up.load_state_dict(weights, strict=False)
        logger.info("Loading Base-Pretrain weights...")
        weights = mge.load(args.weight_file)
        # keep only weights that do not belong to the prediction head ("pred_")
        weight_new = {k: v for k, v in weights.items() if "pred_" not in k}
        model.load_state_dict(weight_new, strict=False)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(args.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        if dist.get_rank() == 0:
            save_path = "logs/{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch)
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict()
                },
                save_path,
            )
            logger.info("dump weights to %s", save_path)
Code example #2
def test_regression_1762():
    x = F.ones((10, 10, 3, 3))

    conv = M.Conv2d(10, 10, kernel_size=3, padding=1)

    t_shape = (1, 10, 1, 1)
    weight = mge.Parameter(np.ones(t_shape, dtype=np.float32))
    bias = mge.Parameter(np.zeros(t_shape, dtype=np.float32))

    gm = GradManager()
    gm.attach(list(conv.parameters()) + [weight, bias])

    with gm:
        out1 = conv(x)

        out2 = F.batch_norm(
            out1,
            None,
            None,
            weight,
            bias,
            training=True,
        )

        # The weird error only occurs when this op is placed after BN;
        # the op type is not relevant
        loss = out1 + 1
        gm.backward(loss)
Code example #3
def test_attach_in_with_block():
    a = mge.Parameter([1.0])
    gm = GradManager()
    with gm:
        b = a * 3
        gm.attach(b)
        c = b + 1
        gm.backward(c)
    assert int(b.grad.numpy()) == 1
Code example #4
def test_empty_grad_in_backward():
    x = mge.Parameter(F.full(100, 0.5))
    y = mge.Parameter(F.ones(100))

    gm = GradManager()
    gm.attach([x, y])

    with gm:
        z = F.where(x > 0.7, x, y)
        loss = z.sum()
        gm.backward(loss)
        assert np.all(x.grad.numpy() == 0)
        assert np.all(y.grad.numpy() == 1)
Code example #5
def f():
    gm = GradManager()
    scaler = GradScaler()

    x = mge.tensor(1.0)
    for _ in range(3):
        with gm:
            y = x + 1
            gm.attach(y)
            loss = y + 1
            scaler.backward(gm, loss, unscale_grad=False)
        np.testing.assert_equal(y.grad.numpy(), scaler.scale_factor)
        scaler.unscale(gm.attached_tensors())
        np.testing.assert_equal(y.grad.numpy(), 1)
    # test handling of None elements
    scaler.unscale(gm.attached_tensors())
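For reference, a hedged sketch of how GradScaler is usually combined with GradManager in an actual mixed-precision training step, built from the calls exercised above plus megengine.amp.autocast (model, opt, batch, gm, and scaler are placeholders, not from the source):

from megengine.amp import GradScaler, autocast

with gm:
    with autocast():  # run the forward pass in mixed precision
        loss = model(batch)
    # scale the loss, run backward, and unscale the gradients
    # (the test above opts out of unscaling with unscale_grad=False)
    scaler.backward(gm, loss)
opt.step()
opt.clear_grad()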
Code example #6
def test_no_dependency():
    x = mge.tensor(3)

    w = mge.Parameter(1.0)
    w_no_dep = mge.Parameter(1.0)
    gm = GradManager()
    gm.attach(w)
    gm.attach(w_no_dep)

    with gm:
        out1 = x * w
        out2 = w_no_dep * out1
        gm.backward(out1.sum())

    assert w.grad is not None
    assert w_no_dep.grad is None
Code example #7
def test_attach_temporary():
    w = mge.Parameter(2.0)
    gm = GradManager()
    gm.attach(w)

    def cb(x, g):
        assert x is ref()
        cb.called = True

    for i in range(3):
        with gm:
            cb.called = False
            x = mge.Tensor(i, dtype="float32")
            gm.attach(x, callbacks=cb)
            ref = weakref.ref(x)
            y = x * w
            gm.backward(y)
            assert cb.called
        del x
        assert ref() is None
Code example #8
def test_attached_tensors():
    w1 = mge.Parameter(2.0)
    w2 = mge.Parameter(2.0)
    gm = GradManager()

    def check(expected):
        actual = gm.attached_tensors()
        assert len(expected) == len(actual)
        for exp, act in zip(expected, actual):
            assert exp is act

    gm.attach(w1)
    check([w1])
    gm.attach(w2)
    check([w1, w2])
    gm.attach(w1)
    check([w1, w2])
Code example #9
File: train.py  Project: zzh7982/Models
def worker(args):
    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    backbone_params = []
    head_params = []
    for name, param in model.named_parameters():
        if "backbone" in name:
            backbone_params.append(param)
        else:
            head_params.append(param)

    opt = SGD(
        [
            {
                "params": backbone_params,
                "lr": model.cfg.learning_rate * 0.1
            },
            {
                "params": head_params
            },
        ],
        lr=model.cfg.learning_rate,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(model.parameters(),
                  callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(model.parameters())

    cur_epoch = 0
    if args.resume is not None:
        pretrained = mge.load(args.resume)
        cur_epoch = pretrained["epoch"] + 1
        model.load_state_dict(pretrained["state_dict"])
        opt.load_state_dict(pretrained["opt"])
        if dist.get_rank() == 0:
            logger.info("load success: epoch %d", cur_epoch)

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(model.cfg.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(cur_epoch, model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch)
        if dist.get_rank() == 0:
            save_path = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch)
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict(),
                    "opt": opt.state_dict()
                }, save_path)
            logger.info("dump weights to %s", save_path)
Code example #10
File: train.py  Project: zzh7982/Models
def worker(master_ip, port, rank, world_size, args):
    if world_size > 1:
        # Initialize distributed process group
        logger.info("init distributed process group {} / {}".format(rank, world_size))
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)

    model = getattr(kpm, args.arch)()
    model.train()
    start_epoch = 0
    if args.resume is not None:
        file = mge.load(args.resume)
        model.load_state_dict(file["state_dict"])
        start_epoch = file["epoch"]

    optimizer = optim.Adam(
        model.parameters(), lr=cfg.initial_lr, weight_decay=cfg.weight_decay
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(
            model.parameters(), callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)],
        )
    else:
        gm.attach(model.parameters())

    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    # Build train datasets
    logger.info("preparing dataset..")
    ann_file = os.path.join(
        cfg.data_root, "annotations", "person_keypoints_train2017.json"
    )
    train_dataset = COCOJoints(
        cfg.data_root,
        ann_file,
        image_set="train2017",
        order=("image", "keypoints", "boxes", "info"),
    )
    logger.info("Num of Samples: {}".format(len(train_dataset)))
    train_sampler = data.RandomSampler(
        train_dataset, batch_size=cfg.batch_size, drop_last=True
    )

    transforms = [
        T.Normalize(mean=cfg.img_mean, std=cfg.img_std),
        RandomHorizontalFlip(0.5, keypoint_flip_order=cfg.keypoint_flip_order)
    ]

    if cfg.half_body_transform:
        transforms.append(
            HalfBodyTransform(
                cfg.upper_body_ids, cfg.lower_body_ids, cfg.prob_half_body
            )
        )
    if cfg.extend_boxes:
        transforms.append(
            ExtendBoxes(cfg.x_ext, cfg.y_ext, cfg.input_shape[1] / cfg.input_shape[0])
        )

    transforms += [
        RandomBoxAffine(
            degrees=cfg.rotate_range,
            scale=cfg.scale_range,
            output_shape=cfg.input_shape,
            rotate_prob=cfg.rotation_prob,
            scale_prob=cfg.scale_prob,
        )
    ]
    transforms += [T.ToMode()]

    train_queue = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        num_workers=args.workers,
        transform=T.Compose(transforms=transforms, order=train_dataset.order),
        collator=HeatmapCollator(
            cfg.input_shape,
            cfg.output_shape,
            cfg.keypoint_num,
            cfg.heat_thr,
            cfg.heat_kernels if args.multi_scale_supervision else cfg.heat_kernels[-1:],
            cfg.heat_range,
        ),
    )

    # Start training
    for epoch in range(start_epoch, cfg.epochs):
        loss = train(model, train_queue, optimizer, gm, epoch=epoch)
        logger.info("Epoch %d Train %.6f ", epoch, loss)

        if rank == 0 and epoch % cfg.save_freq == 0:  # save checkpoint
            mge.save(
                {"epoch": epoch + 1, "state_dict": model.state_dict()},
                os.path.join(save_dir, "epoch_{}.pkl".format(epoch)),
            )
Code example #11
def worker(master_ip, port, world_size, rank, args):
    if world_size > 1:
        dist.init_process_group(
            master_ip=master_ip,
            port=port,
            world_size=world_size,
            rank=rank,
            device=rank,
        )
        logger.info("Init process group for gpu{} done".format(rank))

    current_network = import_from_file(args.file)

    model = current_network.Net(current_network.Cfg())
    model.train()

    if dist.get_rank() == 0:
        logger.info(get_config_info(model.cfg))
        logger.info(repr(model))

    params_with_grad = []
    for name, param in model.named_parameters():
        if "bottom_up.conv1" in name and model.cfg.backbone_freeze_at >= 1:
            continue
        if "bottom_up.layer1" in name and model.cfg.backbone_freeze_at >= 2:
            continue
        params_with_grad.append(param)

    opt = SGD(
        params_with_grad,
        lr=model.cfg.basic_lr * args.batch_size,
        momentum=model.cfg.momentum,
        weight_decay=model.cfg.weight_decay * dist.get_world_size(),
    )

    gm = GradManager()
    if dist.get_world_size() > 1:
        gm.attach(params_with_grad,
                  callbacks=[dist.make_allreduce_cb("SUM", dist.WORLD)])
    else:
        gm.attach(params_with_grad)

    if args.weight_file is not None:
        weights = mge.load(args.weight_file)
        model.backbone.bottom_up.load_state_dict(weights, strict=False)
    if dist.get_world_size() > 1:
        dist.bcast_list_(model.parameters(), dist.WORLD)  # sync parameters

    if dist.get_rank() == 0:
        logger.info("Prepare dataset")
    train_loader = iter(
        build_dataloader(args.batch_size, args.dataset_dir, model.cfg))

    for epoch in range(model.cfg.max_epoch):
        train_one_epoch(model, train_loader, opt, gm, epoch, args)
        if dist.get_rank() == 0:
            # save_path = "log-of-{}/epoch_{}.pkl".format(
            #     os.path.basename(args.file).split(".")[0], epoch
            # )
            save_path = os.path.join(args.log_dir,
                                     "epoch_{}.pkl".format(epoch))
            mge.save(
                {
                    "epoch": epoch,
                    "state_dict": model.state_dict()
                },
                save_path,
            )
            logger.info("dump weights to %s", save_path)
Code example #12
def test_attach_differentible_tensor_dtype(differentible_dtype):
    gm = GradManager()
    x = mge.tensor([1], dtype=differentible_dtype)
    gm.attach([x])
Code example #13
def test_attach_invalid_tensor_dtype(invalid_dtype):
    gm = GradManager()
    x = mge.tensor([1], dtype=invalid_dtype)
    with pytest.raises(AssertionError):
        gm.attach([x])
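The two dtype tests above receive their dtype as an argument, presumably supplied by pytest parametrization elsewhere in the test module. A hedged sketch of that wiring (the concrete dtype values are assumptions, not taken from the source):

import pytest

# Hypothetical dtype lists; the real test module may use different values.
@pytest.mark.parametrize("differentible_dtype", ["float16", "float32"])
def test_attach_differentible_tensor_dtype(differentible_dtype):
    ...  # body as in code example #12

@pytest.mark.parametrize("invalid_dtype", ["int32", "uint8"])
def test_attach_invalid_tensor_dtype(invalid_dtype):
    ...  # body as in code example #13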