def test_reduce_sum():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, data, expect, port):
        if mge.get_device_count("gpu") < world_size:
            return
        dist.init_process_group("localhost", port, world_size, rank, rank)
        inp = tensor(data)
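        # reduce_sum accumulates the inputs from all ranks onto the root (rank 0);
        # the assertions below expect the sum on rank 0 and 0 on the other rank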
        output = reduce_sum(inp)
        if rank == 0:
            assert np.allclose(output.numpy(), expect)
        else:
            assert np.allclose(output.numpy(), 0)

    def check(shape):
        x = np.random.rand(*shape).astype("float32")
        y = np.random.rand(*shape).astype("float32")
        z = x + y
        p0 = mp.Process(target=worker, args=(0, x, z, port))
        p1 = mp.Process(target=worker, args=(1, y, None, port))

        p0.start()
        p1.start()

        p0.join(10)
        p1.join(10)

        assert p0.exitcode == 0 and p1.exitcode == 0

    for shape in [(2, 3), (8, 10), (99, 77)]:
        check(shape)
def test_all_to_all():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, data, expect, port):
        if mge.get_device_count("gpu") < world_size:
            return
        dist.init_process_group("localhost", port, world_size, rank, rank)
        inp = tensor(data)
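        # all_to_all splits each rank's tensor along the first axis and exchanges
        # the pieces, so each rank ends up with one slice from every rank (see check())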
        output = all_to_all(inp)
        assert np.allclose(output.numpy(), expect)

    def check(shape):
        x = np.random.rand(*shape).astype("float32")
        y = np.random.rand(*shape).astype("float32")
        a = np.concatenate((x[:shape[0] // 2], y[:shape[0] // 2]))
        b = np.concatenate((x[shape[0] // 2:], y[shape[0] // 2:]))
        p0 = mp.Process(target=worker, args=(0, x, a, port))
        p1 = mp.Process(target=worker, args=(1, y, b, port))

        p0.start()
        p1.start()

        p0.join(10)
        p1.join(10)

        assert p0.exitcode == 0 and p1.exitcode == 0

    for shape in [(2, 3), (8, 10), (100, 77)]:
        check(shape)
def test_broadcast():
    world_size = 2
    server = dist.Server()
    port = server.py_server_port

    def worker(rank, data, expect, port):
        if mge.get_device_count("gpu") < world_size:
            return
        dist.init_process_group("localhost", port, world_size, rank, rank)
        inp = tensor(data)
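        # broadcast replicates rank 0's tensor to every rank, so both ranks expect x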
        output = broadcast(inp)
        assert np.allclose(output.numpy(), expect)

    def check(shape):
        x = np.random.rand(*shape).astype("float32")
        y = x + 1
        p0 = mp.Process(target=worker, args=(0, x, x, port))
        p1 = mp.Process(target=worker, args=(1, y, x, port))

        p0.start()
        p1.start()

        p0.join(10)
        p1.join(10)

        assert p0.exitcode == 0 and p1.exitcode == 0

    for shape in [(2, 3), (8, 10), (99, 77)]:
        check(shape)
def run_test(
    model_path, use_jit, use_symbolic, sublinear_memory_config=None, max_err=None,
):
    """
    Load the model with test cases and run the training for one iteration.
    The loss and the updated weights are compared with reference values to verify correctness.

    If you believe a failure is caused by numerical rounding errors rather than a bug,
    dump a new reference file by calling update_model. Please think twice before doing so.
    """
    checkpoint = mge.load(model_path)
    data = checkpoint["data"]
    label = checkpoint["label"]
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, max_err):
        dist.init_process_group("localhost", port, p_num, rank, rank)
        net = MnistNet(has_bn=True)
        net.load_state_dict(checkpoint["net_init"])
        lr = checkpoint["sgd_lr"]
        opt = SGD(net.parameters(), lr=lr)

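        # the "MEAN" allreduce callback averages each parameter's gradient across
        # all ranks during backward, keeping the replicas in sync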
        gm = ad.GradManager().attach(
            net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)]
        )

        # use the same data and label on every GPU so that the result
        # does not depend on the number of GPUs
        data_train = Tensor(data)
        label_train = Tensor(label)

        loss = train(data_train, label_train, net, opt, gm)

        np.testing.assert_allclose(loss.numpy(), checkpoint["loss"], atol=max_err)

        if dist.get_rank():
            return
        for param, param_ref in zip(
            net.state_dict().items(), checkpoint["net_updated"].items()
        ):
            assert param[0] == param_ref[0]
            if "bn" in param[0]:
                ref = param_ref[1].reshape(param[1].shape)
                np.testing.assert_allclose(param[1], ref, atol=max_err)
            else:
                np.testing.assert_allclose(param[1], param_ref[1], atol=max_err)

    procs = []
    for rank in range(p_num):
        p = mp.Process(target=worker, args=(rank, max_err,))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
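
# The synchronization tests below rely on two queue helpers that are not part of
# this excerpt. A minimal sketch of what they are assumed to do (the timeout
# values are assumptions, not taken from the original source):
import queue as _queue


def _assert_q_empty(q):
    # the queue should yield nothing within a short grace period
    try:
        res = q.get(timeout=1)
    except _queue.Empty:
        return
    assert False, "queue is not empty, got {}".format(res)


def _assert_q_val(q, val):
    # the next item in the queue should equal the expected value
    ret = q.get(timeout=5)
    assert ret == val
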
def test_synchronized():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

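    # @dist.synchronized runs the decorated call under a group barrier, so no rank
    # returns from func until every rank has entered it (verified by the asserts below)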
    @dist.synchronized
    def func(rank, q):
        q.put(rank)

    def worker(rank, q):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        dist.group_barrier()
        if rank == 0:
            func(0, q)  # q.put(0)
            q.put(2)
        else:
            _assert_q_val(q, 0)  # func executed in rank 0
            _assert_q_empty(q)  # q.put(2) is not executed
            func(1, q)
            _assert_q_val(q, 1)  # func on rank 1 runs before q.put(2) on rank 0
            _assert_q_val(q, 2)  # q.put(2) executed in rank 0

    Q = mp.Queue()
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, Q))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
def test_init_process_group():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, backend):
        dist.init_process_group("localhost", port, world_size, rank, rank,
                                backend)
        assert dist.is_distributed()
        assert dist.get_rank() == rank
        assert dist.get_world_size() == world_size
        assert dist.get_backend() == backend

        py_server_addr = dist.get_py_server_addr()
        assert py_server_addr[0] == "localhost"
        assert py_server_addr[1] == port

        mm_server_addr = dist.get_mm_server_addr()
        assert mm_server_addr[0] == "localhost"
        assert mm_server_addr[1] > 0

        assert isinstance(dist.get_client(), dist.Client)

    def check(backend):
        procs = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, backend))
            p.start()
            procs.append(p)

        for p in procs:
            p.join(20)
            assert p.exitcode == 0

    check("nccl")
def main():
    parser = make_parser()
    args = parser.parse_args()

    # ------------------------ begin training -------------------------- #
    logger.info("Device Count = %d", args.ngpus)

    log_dir = "log-of-{}".format(os.path.basename(args.file).split(".")[0])
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    if args.ngpus > 1:
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)
        processes = list()
        for rank in range(args.ngpus):
            process = mp.Process(
                target=worker, args=(master_ip, port, args.ngpus, rank, args)
            )
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(None, None, 1, 0, args)
def test_io_remote():
    world_size = 2
    server = dist.Server()
    port = server.py_server_port
    val = np.random.rand(4, 5).astype(np.float32)

    def worker(rank):
        if mge.get_device_count("gpu") < world_size:
            return
        if rank == 0:  # remote send
            dist.init_process_group("localhost", port, world_size, rank, rank)
            x = Tensor(val, device="gpu0")
            y = remote_send(x, 1)
            assert y.numpy()[0] == 0
        else:  # remote recv
            dist.init_process_group("localhost", port, world_size, rank, rank)
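            # remote_recv blocks until the matching remote_send on rank 0 delivers
            # a tensor of the given shape and dtype onto this rank's device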
            y = remote_recv(0, val.shape, val.dtype)
            assert y.device == "gpu1"
            np.testing.assert_almost_equal(val, y.numpy())

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, ))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(10)
        assert p.exitcode == 0
def test_group_barrier():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, q):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        dist.group_barrier()
        if rank == 0:
            dist.group_barrier()
            q.put(0)  # to be observed in rank 1
        else:
            _assert_q_empty(q)  # q.put(0) is not executed in rank 0
            dist.group_barrier()
            _assert_q_val(q, 0)  # q.put(0) executed in rank 0

    Q = mp.Queue()
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, Q))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", default="net.py", type=str, help="net description file"
    )
    parser.add_argument(
        "-w", "--weight_file", default=None, type=str, help="weights file",
    )
    parser.add_argument(
        "-n", "--devices", default=1, type=int, help="total number of gpus for testing",
    )
    parser.add_argument(
        "-d", "--dataset_dir", default="/data/datasets", type=str,
    )
    args = parser.parse_args()

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()

    result_list = []
    if args.devices > 1:
        result_queue = Queue(500)

        master_ip = "localhost"
        server = dist.Server()
        port = server.py_server_port

        procs = []
        for i in range(args.devices):
            proc = Process(
                target=worker,
                args=(
                    current_network,
                    args.weight_file,
                    args.dataset_dir,
                    result_queue,
                    master_ip,
                    port,
                    args.devices,
                    i,
                ),
            )
            proc.start()
            procs.append(proc)

        num_imgs = dict(VOC2012=1449, Cityscapes=500)

        for _ in tqdm(range(num_imgs[cfg.dataset])):
            result_list.append(result_queue.get())

        for p in procs:
            p.join()
    else:
        worker(current_network, args.weight_file, args.dataset_dir, result_list)

    if cfg.val_save_path is not None:
        save_results(result_list, cfg.val_save_path, cfg)
    logger.info("Start evaluation!")
    compute_metric(result_list, cfg)
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir must be set in the config file if it is not given on the command line'

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
    logger.info('Config:\n{}'.format(cfg.text))

    gpu_list = [item.strip() for item in args.gpuids.split(",")]
    if gpu_list[0] == "-1":
        world_size = 0  # use cpu
        logger.info('testing on cpu only')
    else:
        world_size = len(gpu_list)
        logger.info('number of test gpus: {}'.format(world_size))

    # assert world_size <= mge.get_device_count("gpu")

    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    elif world_size == 1:
        mge.set_default_device(device='gpu' + gpu_list[0])
    else:
        pass

    if world_size > 1:
        port = dist.util.get_free_ports(1)[0]
        server = dist.Server(port)
        processes = []
        for rank in range(world_size):
            logger.info("init distributed process group {} / {}".format(
                rank, world_size))
            p = mp.Process(target=worker,
                           args=(rank, world_size, cfg, gpu_list[rank], port))
            p.start()
            processes.append(p)

        for rank in range(world_size):
            processes[rank].join()
            code = processes[rank].exitcode
            assert code == 0, "subprocess {} exit with code {}".format(
                rank, code)
    else:
        worker(0, 1, cfg)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        default="simplebaseline_res50",
        type=str,
        choices=cfg.model_choices,
    )
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("-b", "--batch_size", default=32, type=int)
    parser.add_argument("-lr", "--initial_lr", default=3e-4, type=float)

    parser.add_argument("--resume", default=None, type=str)

    parser.add_argument("--multi_scale_supervision", action="store_true")

    parser.add_argument("-n", "--ngpus", default=8, type=int)
    parser.add_argument("-w", "--workers", default=8, type=int)

    args = parser.parse_args()

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    if args.batch_size != cfg.batch_size:
        cfg.batch_size = args.batch_size
    if args.initial_lr != cfg.initial_lr:
        cfg.initial_lr = args.initial_lr

    world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus

    if world_size > 1:
        master_ip = "localhost"

        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        # scale weight decay by the number of gpus
        cfg.weight_decay *= world_size
        # start distributed training, dispatch sub-processes
        processes = []
        for rank in range(world_size):
            p = mp.Process(
                target=worker, args=(master_ip, port, rank, world_size, args)
            )
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(None, None, 0, 1, args)
def test_syncbn(enable_amp):
    nr_chan = 8
    data_shape = (3, nr_chan, 4, 16)
    momentum = 0.9
    eps = 1e-5
    running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32)
    running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32)
    steps = 4
    nr_ranks = 2
    server = dist.Server()
    port = server.py_server_port

    @dist.launcher(n_gpus=2)
    def worker(data, yv_expect, running_mean, running_var):
        with amp.autocast(enabled=enable_amp):
            rank = dist.get_rank()
            bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
            for i in range(steps):
                yv = bn(Tensor(data[rank][i]))
        if enable_amp:
            np.testing.assert_allclose(
                yv.numpy(), yv_expect[rank], atol=5e-4, rtol=5e-4
            )
        else:
            _assert_allclose(yv.numpy(), yv_expect[rank])
        _assert_allclose(bn.running_mean.numpy(), running_mean)
        _assert_allclose(bn.running_var.numpy(), running_var)

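    # the reference mean/var are computed from the full tensors before they are
    # split across ranks, because SyncBatchNorm aggregates statistics over all ranks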
    xv = []
    for i in range(steps):
        xv.append(np.random.normal(loc=2.3, size=data_shape).astype(np.float32))
        xv_transposed = np.transpose(xv[i], [0, 2, 3, 1]).reshape(
            (data_shape[0] * data_shape[2] * data_shape[3], nr_chan)
        )

        mean = np.mean(xv_transposed, axis=0).reshape(1, nr_chan, 1, 1)

        var_biased = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1))
        sd = np.sqrt(var_biased + eps)

        var_unbiased = np.var(xv_transposed, axis=0, ddof=1).reshape((1, nr_chan, 1, 1))
        running_mean = running_mean * momentum + mean * (1 - momentum)
        running_var = running_var * momentum + var_unbiased * (1 - momentum)

        yv_expect = (xv[i] - mean) / sd

    data = []
    for i in range(nr_ranks):
        data.append([])
        for j in range(steps):
            data[i].append(xv[j][:, :, :, i * 8 : i * 8 + 8])

    yv_expect = [yv_expect[:, :, :, i * 8 : i * 8 + 8] for i in range(nr_ranks)]

    worker(data, yv_expect, running_mean, running_var)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-f",
                        "--file",
                        default="net.py",
                        type=str,
                        help="net description file")
    parser.add_argument("-n",
                        "--ngpus",
                        type=int,
                        default=8,
                        help="batch size for training")
    parser.add_argument(
        "-d",
        "--dataset_dir",
        type=str,
        default="/data/datasets",
    )
    parser.add_argument("-r",
                        "--resume",
                        type=str,
                        default=None,
                        help="resume model file")
    args = parser.parse_args()

    # ------------------------ begin training -------------------------- #
    logger.info("Device Count = %d", args.ngpus)

    log_dir = "log-of-{}".format(os.path.basename(args.file).split(".")[0])
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    if args.ngpus > 1:
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)
        processes = list()
        for rank in range(args.ngpus):
            process = mp.Process(target=worker,
                                 args=(master_ip, port, args.ngpus, rank,
                                       args))
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(None, None, 1, 0, args)
def test_dist_grad():
    world_size = 2
    x_np = np.random.rand(10).astype("float32")
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker0():
        dist.init_process_group("localhost", port, world_size, 0, 0)
        mge.device.set_default_device("gpu0")
        grad = Grad()

        x = as_tensor(x_np)
        grad.wrt(x, callback=save_to(x))
        # need a placeholder to trace operator
        send_x = remote_send(x, 1)
        recv_x = remote_recv(1, x_np.shape, x_np.dtype, "gpu0")
        y = recv_x * recv_x
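        # the gradient of y = recv_x * recv_x flows back through the recv/send pair
        # to x on rank 0, so x.grad is expected to equal 2 * x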

        grad([y], [as_tensor(np.ones_like(x_np))])
        np.testing.assert_almost_equal(x.grad.numpy(), x.numpy() * 2)

    def worker1():
        dist.init_process_group("localhost", port, world_size, 1, 1)
        mge.device.set_default_device("gpu1")
        grad = Grad()

        recv_x = remote_recv(0, x_np.shape, x_np.dtype, "gpu1")
        send_x = remote_send(recv_x, 0)

        grad([], [])

        # sync because grad has a send operator
        sync()
        send_x.device._cn._sync_all()

    import multiprocessing as mp

    p0 = mp.Process(target=worker0)
    p1 = mp.Process(target=worker1)
    p0.start()
    p1.start()
    p0.join(10)
    p1.join(10)
    assert p0.exitcode == 0 and p1.exitcode == 0
def test_user_set_get():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        # every rank sets the same key concurrently (deliberate race)
        dist.get_client().user_set("foo", 1)
        # every rank then reads the key back
        ret = dist.get_client().user_get("foo")
        assert ret == 1

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, ))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
def main(args):
    configs = load_config_from_path(args.config_file)

    num_devices = dist.helper.get_device_count_by_fork("gpu")
    if num_devices > 1:
        # distributed training
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)
        processes = []
        for rank in range(num_devices):
            process = mp.Process(target=worker,
                                 args=(master_ip, port, num_devices, rank,
                                       configs))
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        # non-distributed training
        worker(None, None, 1, 0, configs)
def test_sync_min_max_observer():
    x = np.random.rand(6, 3, 3, 3).astype("float32")
    np_min, np_max = x.min(), x.max()
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, slc):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        m = ob.SyncMinMaxObserver()
        y = mge.tensor(x[slc])
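        # each rank only observes its own slice, but the observer synchronizes
        # min/max across ranks, so both ranks should see the global extrema of x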
        m(y)
        assert m.min_val == np_min and m.max_val == np_max

    procs = []
    for rank in range(world_size):
        slc = slice(rank * 3, (rank + 1) * 3)
        p = mp.Process(target=worker, args=(rank, slc,), daemon=True)
        p.start()
        procs.append(p)
    for p in procs:
        p.join(20)
        assert p.exitcode == 0
def test_new_group():
    world_size = 3
    ranks = [2, 0]
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        if rank in ranks:
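            # only ranks 2 and 0 join the new group; in-group ranks follow the
            # order given in `ranks`, hence key "2,0" and size 2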
            group = dist.new_group(ranks)
            assert group.size == 2
            assert group.key == "2,0"
            assert group.rank == ranks.index(rank)
            assert group.comp_node == "gpu{}:2".format(rank)

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, ))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
def main():
    # pylint: disable=import-outside-toplevel,too-many-branches,too-many-statements
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    parser = make_parser()
    args = parser.parse_args()

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()

    if args.weight_file:
        args.start_epoch = args.end_epoch = -1
    else:
        if args.start_epoch == -1:
            args.start_epoch = cfg.max_epoch - 1
        if args.end_epoch == -1:
            args.end_epoch = args.start_epoch
        assert 0 <= args.start_epoch <= args.end_epoch < cfg.max_epoch

    for epoch_num in range(args.start_epoch, args.end_epoch + 1):
        if args.weight_file:
            weight_file = args.weight_file
        else:
            weight_file = "log-of-{}/epoch_{}.pkl".format(
                os.path.basename(args.file).split(".")[0], epoch_num)

        result_list = []
        if args.devices > 1:
            result_queue = Queue(2000)

            master_ip = "localhost"
            server = dist.Server()
            port = server.py_server_port
            procs = []
            for i in range(args.devices):
                proc = Process(
                    target=worker,
                    args=(
                        current_network,
                        weight_file,
                        args.dataset_dir,
                        result_queue,
                        master_ip,
                        port,
                        args.devices,
                        i,
                    ),
                )
                proc.start()
                procs.append(proc)

            num_imgs = dict(coco=5000, objects365=30000)

            for _ in tqdm(range(num_imgs[cfg.test_dataset["name"]])):
                result_list.append(result_queue.get())

            for p in procs:
                p.join()
        else:
            worker(current_network, weight_file, args.dataset_dir, result_list)

        all_results = DetEvaluator.format(result_list, cfg)
        json_path = "log-of-{}/epoch_{}.json".format(
            os.path.basename(args.file).split(".")[0], epoch_num)
        all_results = json.dumps(all_results)

        with open(json_path, "w") as fo:
            fo.write(all_results)
        logger.info("Save to %s finished, start evaluation!", json_path)

        eval_gt = COCO(
            os.path.join(args.dataset_dir, cfg.test_dataset["name"],
                         cfg.test_dataset["ann_file"]))
        eval_dt = eval_gt.loadRes(json_path)
        cocoEval = COCOeval(eval_gt, eval_dt, iouType="bbox")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        metrics = [
            "AP",
            "[email protected]",
            "[email protected]",
            "APs",
            "APm",
            "APl",
            "AR@1",
            "AR@10",
            "AR@100",
            "ARs",
            "ARm",
            "ARl",
        ]
        logger.info("mmAP".center(32, "-"))
        for i, m in enumerate(metrics):
            logger.info("|\t%s\t|\t%.03f\t|", m, cocoEval.stats[i])
        logger.info("-" * 32)
def worker(rank, gpu_num, args):
    # using sublinear
    os.environ["MGB_COMP_GRAPH_OPT"] = "enable_sublinear_memory_opt=1;seq_opt.enable_seq_comp_node_opt=0"
    os.environ["MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER"] = '10'
    os.environ['MGB_CUDA_RESERVE_MEMORY'] = '1'
    # establish the server if is the master

    dist_port = args.port
    if rank == 0:
        dist.Server(port=dist_port)
    if gpu_num > 1:
        dist.init_process_group(
            master_ip="localhost",
            port=dist_port,
            world_size=gpu_num,
            rank=rank,
            device=rank,
        )
        logger.info("Init process group for gpu%d done", rank)

    model = network.Network()
    params = model.parameters(requires_grad=True)
    model.train()

    # Autodiff gradient manager
    gm = autodiff.GradManager().attach(
        model.parameters(),
        callbacks=allreduce_cb,
    )

    opt = optim.SGD(
        params,
        lr=cfg.basic_lr * gpu_num * cfg.batch_per_gpu,
        momentum=cfg.momentum,
        weight_decay=cfg.weight_decay,
    )

    if cfg.pretrain_weight is not None:
        weights = mge.load(cfg.pretrain_weight)
        del weights['fc.weight']
        del weights['fc.bias']
        model.resnet50.load_state_dict(weights)

    start_epoch = 0
    if args.resume_weights is not None:
        assert osp.exists(args.resume_weights)
        model_file = args.resume_weights
        print('Loading {} to initialize FPN...'.format(model_file))
        model_dict = mge.load(model_file)
        start_epoch, weights = model_dict['epoch'] + 1, model_dict['state_dict']
        model.load_state_dict(weights, strict=False)
    
    logger.info("Prepare dataset")
    # train_loader = dataset.train_dataset(rank)

    train_dataset = CrowdHuman(cfg, if_train=True)
    train_sampler = data.Infinite(data.RandomSampler(
        train_dataset, batch_size=cfg.batch_per_gpu, drop_last=True,
        world_size=gpu_num, rank=rank,
    ))
    train_loader = data.DataLoader(
        train_dataset,
        sampler=train_sampler,
        collator=train_dataset,
        num_workers=4,
    )

    train_loader = iter(train_loader)
    logger.info("Training...")
    for epoch_id in range(start_epoch, cfg.max_epoch):
        for param_group in opt.param_groups:
            param_group["lr"] = (
                cfg.basic_lr * gpu_num * cfg.batch_per_gpu
                * (cfg.lr_decay_rate ** bisect.bisect_right(cfg.lr_decay_sates, epoch_id))
            )

        max_steps = cfg.nr_images_epoch // (cfg.batch_per_gpu * gpu_num)
        train_one_epoch(model, gm, train_loader, opt, max_steps, rank, epoch_id, gpu_num)
        if rank == 0:
            save_path = osp.join(cfg.model_dir, 'epoch-{}.pkl'.format(epoch_id + 1))
            state_dict = model.state_dict()
            names = [k for k, _ in state_dict.items()]
            for name in names:
                if name.startswith('inputs.'):
                    del state_dict[name]

            mge.save(
                {"epoch": epoch_id, "state_dict": state_dict}, save_path,
            )
            logger.info("dump weights to %s", save_path)
def main():
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    parser = make_parser()
    args = parser.parse_args()
    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0],
                                   cfg.input_shape[1])
    save_dir = os.path.join(args.save_dir, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    args.ngpus = (dist.helper.get_device_count_by_fork("gpu")
                  if args.ngpus is None else args.ngpus)
    cfg.batch_size = cfg.batch_size if args.batch_size is None else args.batch_size

    dt_path = os.path.join(cfg.data_root, "person_detection_results",
                           args.dt_file)
    dets = json.load(open(dt_path, "r"))

    gt_path = os.path.join(cfg.data_root, "annotations",
                           "person_keypoints_val2017.json")
    eval_gt = COCO(gt_path)
    gt = eval_gt.dataset

    dets = [
        i for i in dets
        if (i["image_id"] in eval_gt.imgs and i["category_id"] == 1)
    ]
    ann_file = {"images": gt["images"], "annotations": dets}

    if args.end_epoch == -1:
        args.end_epoch = args.start_epoch

    for epoch_num in range(args.start_epoch, args.end_epoch + 1,
                           args.test_freq):
        if args.model:
            model_file = args.model
        else:
            model_file = "{}/epoch_{}.pkl".format(args.model_dir, epoch_num)
        logger.info("Load Model : %s completed", model_file)

        all_results = list()

        result_queue = Queue(5000)
        # create the server once, before launching the per-GPU worker processes
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        procs = []
        for i in range(args.ngpus):
            proc = Process(
                target=worker,
                args=(
                    args.arch,
                    model_file,
                    cfg.data_root,
                    ann_file,
                    master_ip,
                    port,
                    i,
                    args.ngpus,
                    result_queue,
                ),
            )
            proc.start()
            procs.append(proc)

        for _ in tqdm(range(len(dets))):
            all_results.append(result_queue.get())
        for p in procs:
            p.join()

        json_name = "log-of-{}_epoch_{}.json".format(args.arch, epoch_num)
        json_path = os.path.join(save_dir, json_name)
        all_results = json.dumps(all_results)
        with open(json_path, "w") as fo:
            fo.write(all_results)
        logger.info("Save to %s finished, start evaluation!", json_path)

        eval_dt = eval_gt.loadRes(json_path)
        cocoEval = COCOeval(eval_gt, eval_dt, iouType="keypoints")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        metrics = [
            "AP",
            "[email protected]",
            "[email protected]",
            "APm",
            "APl",
            "AR",
            "[email protected]",
            "[email protected]",
            "ARm",
            "ARl",
        ]
        logger.info("mmAP".center(32, "-"))
        for i, m in enumerate(metrics):
            logger.info("|\t%s\t|\t%.03f\t|", m, cocoEval.stats[i])
        logger.info("-" * 32)
def main():
    # pylint: disable=import-outside-toplevel,too-many-branches,too-many-statements
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    parser = make_parser()
    args = parser.parse_args()

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()

    # if args.weight_file:
    if not args.weight_dir:
        args.start_epoch = args.end_epoch = -1
    else:
        if args.start_epoch == -1:
            args.start_epoch = cfg.max_epoch - 1
        if args.end_epoch == -1:
            args.end_epoch = args.start_epoch
        assert 0 <= args.start_epoch <= args.end_epoch < cfg.max_epoch

    for epoch_num in range(args.start_epoch, args.end_epoch + 1):
        # if args.weight_file:
        #     weight_file = args.weight_file
        # else:
        #     weight_file = "log-of-{}/epoch_{}.pkl".format(
        #         os.path.basename(args.file).split(".")[0], epoch_num
        #     )
        if args.weight_dir:
            weight_dir = args.weight_dir
        else:
            weight_dir = "train_log/baseline"
        weight_file = os.path.join(weight_dir, "epoch_{}.pkl".format(epoch_num))

        if args.ngpus > 1:
            master_ip = "localhost"
            port = dist.get_free_ports(1)[0]
            dist.Server(port)

            result_list = []
            result_queue = Queue(2000)
            procs = []
            for i in range(args.ngpus):
                proc = Process(
                    target=worker,
                    args=(
                        current_network,
                        weight_file,
                        args.dataset_dir,
                        master_ip,
                        port,
                        args.ngpus,
                        i,
                        result_queue,
                    ),
                )
                proc.start()
                procs.append(proc)

            num_imgs = dict(coco=5000, cocomini=5000, objects365=30000)

            for _ in tqdm(range(num_imgs[cfg.test_dataset["name"]])):
                result_list.append(result_queue.get())
            for p in procs:
                p.join()
        else:
            result_list = []

            worker(
                current_network, weight_file, args.dataset_dir,
                None, None, 1, 0, result_list
            )

        total_time = sum([x["perf_time"] for x in result_list])
        average_time = total_time / len(result_list)
        fps = 1.0 / average_time
        logger.info(
            "average inference speed: {:.4}s / iter, fps:{:.3}".format(average_time, fps)
        )

        all_results = DetEvaluator.format(result_list, cfg)
        # json_path = "log-of-{}/epoch_{}.json".format(
        #     os.path.basename(args.file).split(".")[0], epoch_num
        # )
        json_path = os.path.join(weight_dir, "epoch_{}.json".format(epoch_num))

        all_results = json.dumps(all_results)

        with open(json_path, "w") as fo:
            fo.write(all_results)
        logger.info("Save to %s finished, start evaluation!", json_path)

        eval_gt = COCO(
            os.path.join(
                args.dataset_dir, cfg.test_dataset["name"], cfg.test_dataset["ann_file"]
            )
        )
        eval_dt = eval_gt.loadRes(json_path)
        cocoEval = COCOeval(eval_gt, eval_dt, iouType="bbox")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        metrics = [
            "AP",
            "[email protected]",
            "[email protected]",
            "APs",
            "APm",
            "APl",
            "AR@1",
            "AR@10",
            "AR@100",
            "ARs",
            "ARm",
            "ARl",
        ]
        logger.info("mmAP".center(32, "-"))
        for i, m in enumerate(metrics):
            logger.info("|\t%s\t|\t%.03f\t|", m, cocoEval.stats[i])
        logger.info("-" * 32)
def test_syncbn():
    nr_chan = 8
    data_shape = (3, nr_chan, 4, 16)
    momentum = 0.9
    eps = 1e-5
    running_mean = np.zeros((1, nr_chan, 1, 1), dtype=np.float32)
    running_var = np.ones((1, nr_chan, 1, 1), dtype=np.float32)
    steps = 4
    nr_ranks = 2
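    # a port argument of 0 presumably lets the server bind any free port; the
    # actual port is read back from py_server_port below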
    server = dist.Server(0)
    port = server.py_server_port

    def worker(rank, data, yv_expect, running_mean, running_var):
        if mge.get_device_count("gpu") < nr_ranks:
            return
        dist.init_process_group("localhost", port, nr_ranks, rank, rank)
        bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
        for i in range(steps):
            yv = bn(Tensor(data[i]))

        _assert_allclose(yv.numpy(), yv_expect)
        _assert_allclose(bn.running_mean.numpy(), running_mean)
        _assert_allclose(bn.running_var.numpy(), running_var)

    xv = []
    for i in range(steps):
        xv.append(
            np.random.normal(loc=2.3, size=data_shape).astype(np.float32))
        xv_transposed = np.transpose(xv[i], [0, 2, 3, 1]).reshape(
            (data_shape[0] * data_shape[2] * data_shape[3], nr_chan))

        mean = np.mean(xv_transposed, axis=0).reshape(1, nr_chan, 1, 1)

        var_biased = np.var(xv_transposed, axis=0).reshape((1, nr_chan, 1, 1))
        sd = np.sqrt(var_biased + eps)

        var_unbiased = np.var(xv_transposed, axis=0, ddof=1).reshape(
            (1, nr_chan, 1, 1))
        running_mean = running_mean * momentum + mean * (1 - momentum)
        running_var = running_var * momentum + var_unbiased * (1 - momentum)

        yv_expect = (xv[i] - mean) / sd

    data = []
    for i in range(nr_ranks):
        data.append([])
        for j in range(steps):
            data[i].append(xv[j][:, :, :, i * 8:i * 8 + 8])

    procs = []
    for rank in range(nr_ranks):
        p = mp.Process(
            target=worker,
            args=(
                rank,
                data[rank],
                yv_expect[:, :, :, rank * 8:rank * 8 + 8],
                running_mean,
                running_var,
            ),
        )
        p.start()
        procs.append(p)

    for p in procs:
        p.join(10)
        assert p.exitcode == 0
def main():
    parser = argparse.ArgumentParser(description="MegEngine ImageNet Training")
    parser.add_argument("-d",
                        "--data",
                        metavar="DIR",
                        help="path to imagenet dataset")
    parser.add_argument(
        "-a",
        "--arch",
        default="resnet50",
        help="model architecture (default: resnet50)",
    )
    parser.add_argument(
        "-n",
        "--ngpus",
        default=None,
        type=int,
        help="number of GPUs per node (default: None, use all available GPUs)",
    )
    parser.add_argument(
        "--save",
        metavar="DIR",
        default="output",
        help="path to save checkpoint and log",
    )
    parser.add_argument(
        "--epochs",
        default=90,
        type=int,
        help="number of total epochs to run (default: 90)",
    )
    parser.add_argument(
        "-b",
        "--batch-size",
        metavar="SIZE",
        default=64,
        type=int,
        help="batch size for single GPU (default: 64)",
    )
    parser.add_argument(
        "--lr",
        "--learning-rate",
        metavar="LR",
        default=0.025,
        type=float,
        help="learning rate for single GPU (default: 0.025)",
    )
    parser.add_argument("--momentum",
                        default=0.9,
                        type=float,
                        help="momentum (default: 0.9)")
    parser.add_argument("--weight-decay",
                        default=1e-4,
                        type=float,
                        help="weight decay (default: 0.9)")

    parser.add_argument("-j", "--workers", default=2, type=int)
    parser.add_argument(
        "-p",
        "--print-freq",
        default=20,
        type=int,
        metavar="N",
        help="print frequency (default: 10)",
    )

    parser.add_argument("--dist-addr", default="localhost")
    parser.add_argument("--dist-port", default=23456, type=int)
    parser.add_argument("--world-size", default=1, type=int)
    parser.add_argument("--rank", default=0, type=int)

    args = parser.parse_args()

    # create server if is master
    if args.rank <= 0:
        server = dist.Server(port=args.dist_port)  # pylint: disable=unused-variable  # noqa: F841

    # get device count
    with multiprocessing.Pool(1) as pool:
        ngpus_per_node, _ = pool.map(megengine.get_device_count,
                                     ["gpu", "cpu"])
    if args.ngpus:
        ngpus_per_node = args.ngpus

    # launch processes
    procs = []
    for local_rank in range(ngpus_per_node):
        p = multiprocessing.Process(
            target=worker,
            kwargs=dict(
                rank=args.rank * ngpus_per_node + local_rank,
                world_size=args.world_size * ngpus_per_node,
                ngpus_per_node=ngpus_per_node,
                args=args,
            ),
        )
        p.start()
        procs.append(p)

    # join processes
    for p in procs:
        p.join()
def main():
    parser = argparse.ArgumentParser(description="MegEngine ImageNet Training")
    parser.add_argument("-d",
                        "--data",
                        metavar="DIR",
                        help="path to imagenet dataset")
    parser.add_argument(
        "-a",
        "--arch",
        default="resnet50",
        help="model architecture (default: resnet50)",
    )
    parser.add_argument(
        "-n",
        "--ngpus",
        default=None,
        type=int,
        help="number of GPUs per node (default: None, use all available GPUs)",
    )
    parser.add_argument("-m",
                        "--model",
                        metavar="PKL",
                        default=None,
                        help="path to model checkpoint")

    parser.add_argument("-j", "--workers", default=2, type=int)
    parser.add_argument(
        "-p",
        "--print-freq",
        default=20,
        type=int,
        metavar="N",
        help="print frequency (default: 10)",
    )

    parser.add_argument("--dist-addr", default="localhost")
    parser.add_argument("--dist-port", default=23456, type=int)
    parser.add_argument("--world-size", default=1, type=int)
    parser.add_argument("--rank", default=0, type=int)

    args = parser.parse_args()

    # create server if is master
    if args.rank <= 0:
        server = dist.Server(port=args.dist_port)  # pylint: disable=unused-variable  # noqa: F841

    # get device count
    with multiprocessing.Pool(1) as pool:
        ngpus_per_node, _ = pool.map(megengine.get_device_count,
                                     ["gpu", "cpu"])
    if args.ngpus:
        ngpus_per_node = args.ngpus

    # launch processes
    procs = []
    for local_rank in range(ngpus_per_node):
        p = multiprocessing.Process(
            target=worker,
            kwargs=dict(
                rank=args.rank * ngpus_per_node + local_rank,
                world_size=args.world_size * ngpus_per_node,
                ngpus_per_node=ngpus_per_node,
                args=args,
            ),
        )
        p.start()
        procs.append(p)

    # join processes
    for p in procs:
        p.join()