def test_broadcast():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, data, expect, port):
        if mge.get_device_count("gpu") < world_size:
            return  # skip silently when fewer than world_size GPUs are available
        dist.init_process_group("localhost", port, world_size, rank, rank)
        inp = tensor(data)
        output = broadcast(inp)
        assert np.allclose(output.numpy(), expect)

    def check(shape):
        x = np.random.rand(*shape).astype("float32")
        y = x + 1  # rank 1 feeds different data; broadcast must overwrite it with rank 0's x
        p0 = mp.Process(target=worker, args=(0, x, x, port))
        p1 = mp.Process(target=worker, args=(1, y, x, port))

        p0.start()
        p1.start()

        p0.join(10)
        p1.join(10)

        assert p0.exitcode == 0 and p1.exitcode == 0

    for shape in [(2, 3), (8, 10), (99, 77)]:
        check(shape)
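
All of these examples repeat the same rendezvous boilerplate: reserve a free port, start a dist.Server on it, fork one process per rank, and join each process with a timeout so a hang fails the test. Below is a minimal sketch distilling that pattern, using only the MegEngine calls already shown above; the helper name launch is ours, not part of the API.

import multiprocessing as mp

import megengine.distributed as dist


def launch(worker, world_size, *args):
    port = dist.get_free_ports(1)[0]  # reserve one free TCP port
    server = dist.Server(port)  # rendezvous server; keep it alive while workers run
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(port, rank, world_size) + args)
        p.start()
        procs.append(p)
    for p in procs:
        p.join(20)  # bounded join so a deadlocked worker fails the test
        assert p.exitcode == 0
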
Example #2
def test_io_remote():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)
    val = np.random.rand(4, 5).astype(np.float32)

    def worker(rank):
        if mge.get_device_count("gpu") < world_size:
            return  # not enough GPUs; skip
        if rank == 0:  # remote send
            dist.init_process_group("localhost", port, world_size, rank, rank)
            x = Tensor(val, device="gpu0")
            y = remote_send(x, 1)
            assert y.numpy()[0] == 0  # remote_send returns a placeholder tensor
        else:  # remote recv
            dist.init_process_group("localhost", port, world_size, rank, rank)
            y = remote_recv(0, val.shape, val.dtype)
            assert y.device == "gpu1"
            np.testing.assert_almost_equal(val, y.numpy())

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, ))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(10)
        assert p.exitcode == 0
Example #3
def main():
    parser = make_parser()
    args = parser.parse_args()

    # ------------------------ begin training -------------------------- #
    logger.info("Device Count = %d", args.ngpus)

    log_dir = "log-of-{}".format(os.path.basename(args.file).split(".")[0])
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    if args.ngpus > 1:
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)
        processes = list()
        for rank in range(args.ngpus):
            process = mp.Process(
                target=worker, args=(master_ip, port, args.ngpus, rank, args)
            )
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(None, None, 1, 0, args)
def run_test(
    model_path, use_jit, use_symbolic, sublinear_memory_config=None, max_err=None,
):
    """
    Load the model with test cases and run the training for one iteration.
    The loss and updated weights are compared with reference values to verify correctness.

    If you believe the test fails because of numerical rounding errors rather
    than a bug, dump a new reference file by calling update_model.
    Please think twice before you do so.
    """
    checkpoint = mge.load(model_path)
    data = checkpoint["data"]
    label = checkpoint["label"]
    p_num = 2  # worker/process count; defined at module scope in the original file
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, max_err):
        dist.init_process_group("localhost", port, p_num, rank, rank)
        net = MnistNet(has_bn=True)
        net.load_state_dict(checkpoint["net_init"])
        lr = checkpoint["sgd_lr"]
        opt = SGD(net.parameters(), lr=lr)

        gm = ad.GradManager().attach(
            net.parameters(), callbacks=[dist.make_allreduce_cb("MEAN", dist.WORLD)]
        )

        # use the same data and label on all GPUs
        # so that the result does not depend on the number of GPUs
        data_train = Tensor(data)
        label_train = Tensor(label)

        loss = train(data_train, label_train, net, opt, gm)

        np.testing.assert_allclose(loss.numpy(), checkpoint["loss"], atol=max_err)

        if dist.get_rank():
            return  # only rank 0 compares the updated parameters
        for param, param_ref in zip(
            net.state_dict().items(), checkpoint["net_updated"].items()
        ):
            assert param[0] == param_ref[0]
            if "bn" in param[0]:
                ref = param_ref[1].reshape(param[1].shape)
                np.testing.assert_allclose(param[1], ref, atol=max_err)
            else:
                np.testing.assert_allclose(param[1], param_ref[1], atol=max_err)

    procs = []
    for rank in range(p_num):
        p = mp.Process(target=worker, args=(rank, max_err,))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
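
For reference, a hedged sketch of the update_model helper the docstring points at: it would replay one training step from the same checkpoint and overwrite the stored reference loss and weights. The body below is an assumption pieced together from the checkpoint keys run_test uses, not the real helper.

def update_model(model_path):
    checkpoint = mge.load(model_path)
    net = MnistNet(has_bn=True)
    net.load_state_dict(checkpoint["net_init"])
    opt = SGD(net.parameters(), lr=checkpoint["sgd_lr"])
    gm = ad.GradManager().attach(net.parameters())
    # rerun one iteration and store the new reference values
    loss = train(Tensor(checkpoint["data"]), Tensor(checkpoint["label"]), net, opt, gm)
    checkpoint.update({"net_updated": net.state_dict(), "loss": loss.numpy()})
    mge.save(checkpoint, model_path)
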
def test_group_barrier():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, q):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        dist.group_barrier()
        if rank == 0:
            dist.group_barrier()
            q.put(0)  # to be observed in rank 1
        else:
            _assert_q_empty(q)  # q.put(0) is not executed in rank 0
            dist.group_barrier()
            _assert_q_val(q, 0)  # q.put(0) executed in rank 0

    Q = mp.Queue()
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, Q))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
def test_init_process_group():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, backend):
        dist.init_process_group("localhost", port, world_size, rank, rank,
                                backend)
        assert dist.is_distributed()
        assert dist.get_rank() == rank
        assert dist.get_world_size() == world_size
        assert dist.get_backend() == backend

        py_server_addr = dist.get_py_server_addr()
        assert py_server_addr[0] == "localhost"
        assert py_server_addr[1] == port

        mm_server_addr = dist.get_mm_server_addr()
        assert mm_server_addr[0] == "localhost"
        assert mm_server_addr[1] > 0

        assert isinstance(dist.get_client(), dist.Client)

    def check(backend):
        procs = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, backend))
            p.start()
            procs.append(p)

        for p in procs:
            p.join(20)
            assert p.exitcode == 0

    check("nccl")
def test_synchronized():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    @dist.synchronized
    def func(rank, q):
        q.put(rank)

    def worker(rank, q):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        dist.group_barrier()
        if rank == 0:
            func(0, q)  # q.put(0)
            q.put(2)
        else:
            _assert_q_val(q, 0)  # func executed in rank 0
            _assert_q_empty(q)  # q.put(2) is not executed
            func(1, q)
            _assert_q_val(q, 1)  # func in rank 1 executed earlier than q.put(2) in rank 0
            _assert_q_val(q, 2)  # q.put(2) executed in rank 0

    Q = mp.Queue()
    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, Q))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
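
The ordering asserted above is what you get if dist.synchronized runs the wrapped function and then waits on a group barrier, so no rank returns from the call until every rank has made it. A hedged sketch of that idea (an illustration, not necessarily the actual decorator source):

import functools

import megengine.distributed as dist


def synchronized(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if not dist.is_distributed():
            return func(*args, **kwargs)
        ret = func(*args, **kwargs)
        dist.group_barrier()  # every rank must call func before any rank returns
        return ret
    return wrapper
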
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        default="simplebaseline_res50",
        type=str,
        choices=cfg.model_choices,
    )
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("-b", "--batch_size", default=32, type=int)
    parser.add_argument("-lr", "--initial_lr", default=3e-4, type=float)

    parser.add_argument("--resume", default=None, type=str)

    parser.add_argument("--multi_scale_supervision", action="store_true")

    parser.add_argument("-n", "--ngpus", default=8, type=int)
    parser.add_argument("-w", "--workers", default=8, type=int)

    args = parser.parse_args()

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    if args.batch_size != cfg.batch_size:
        cfg.batch_size = args.batch_size
    if args.initial_lr != cfg.initial_lr:
        cfg.initial_lr = args.initial_lr

    world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus

    if world_size > 1:
        master_ip = "localhost"

        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        # scale weight decay by number of gpus
        cfg.weight_decay *= world_size
        # start distributed training, dispatch sub-processes
        processes = []
        for rank in range(world_size):
            p = mp.Process(
                target=worker, args=(master_ip, port, rank, world_size, args)
            )
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(None, None, 0, 1, args)
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-f",
                        "--file",
                        default="net.py",
                        type=str,
                        help="net description file")
    parser.add_argument("-n",
                        "--ngpus",
                        type=int,
                        default=8,
                        help="number of gpus for training")
    parser.add_argument(
        "-d",
        "--dataset_dir",
        type=str,
        default="/data/datasets",
    )
    parser.add_argument("-r",
                        "--resume",
                        type=str,
                        default=None,
                        help="resume model file")
    args = parser.parse_args()

    # ------------------------ begin training -------------------------- #
    logger.info("Device Count = %d", args.ngpus)

    log_dir = "log-of-{}".format(os.path.basename(args.file).split(".")[0])
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    if args.ngpus > 1:
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)
        processes = list()
        for rank in range(args.ngpus):
            process = mp.Process(target=worker,
                                 args=(master_ip, port, args.ngpus, rank,
                                       args))
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(None, None, 1, 0, args)
Example #10
def test_dist_grad():
    world_size = 2
    x_np = np.random.rand(10).astype("float32")
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker0():
        dist.init_process_group("localhost", port, world_size, 0, 0)
        mge.device.set_default_device("gpu0")
        grad = Grad()

        x = as_tensor(x_np)
        grad.wrt(x, callback=save_to(x))
        # need a placeholder to trace operator
        send_x = remote_send(x, 1)
        recv_x = remote_recv(1, x_np.shape, x_np.dtype, "gpu0")
        y = recv_x * recv_x

        grad([y], [as_tensor(np.ones_like(x_np))])
        np.testing.assert_almost_equal(x.grad.numpy(), x.numpy() * 2)

    def worker1():
        dist.init_process_group("localhost", port, world_size, 1, 1)
        mge.device.set_default_device("gpu1")
        grad = Grad()

        recv_x = remote_recv(0, x_np.shape, x_np.dtype, "gpu1")
        send_x = remote_send(recv_x, 0)

        grad([], [])

        # sync because grad has a send operator
        sync()
        send_x.device._cn._sync_all()

    import multiprocessing as mp

    p0 = mp.Process(target=worker0)
    p1 = mp.Process(target=worker1)
    p0.start()
    p1.start()
    p0.join(10)
    p1.join(10)
    assert p0.exitcode == 0 and p1.exitcode == 0
Example #11
def test_user_set_get():
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        # both ranks set the same key concurrently
        dist.get_client().user_set("foo", 1)
        # and read it back concurrently
        ret = dist.get_client().user_get("foo")
        assert ret == 1

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, ))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
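
user_set/user_get expose the launcher's key-value store; since every rank writes the same value, the concurrent writes above are harmless. One rank could just as well publish a value for the others, as in this usage sketch built on the same client API (the key name master_seed is hypothetical):

def worker(rank):
    dist.init_process_group("localhost", port, world_size, rank, rank)
    if rank == 0:
        dist.get_client().user_set("master_seed", 1234)
    dist.group_barrier()  # ensure the value is published before any rank reads
    seed = dist.get_client().user_get("master_seed")
    assert seed == 1234
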
Example #12
def main(args):
    configs = load_config_from_path(args.config_file)

    num_devices = dist.helper.get_device_count_by_fork("gpu")
    if num_devices > 1:
        # distributed training
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)
        processes = []
        for rank in range(num_devices):
            process = mp.Process(target=worker,
                                 args=(master_ip, port, num_devices, rank,
                                       configs))
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        # non-distributed training
        worker(None, None, 1, 0, configs)
Example #13
def test_sync_min_max_observer():
    x = np.random.rand(6, 3, 3, 3).astype("float32")
    np_min, np_max = x.min(), x.max()
    world_size = 2
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank, slc):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        m = ob.SyncMinMaxObserver()
        y = mge.tensor(x[slc])
        m(y)
        assert m.min_val == np_min and m.max_val == np_max

    procs = []
    for rank in range(world_size):
        slc = slice(rank * 3, (rank + 1) * 3)
        p = mp.Process(target=worker, args=(rank, slc,), daemon=True)
        p.start()
        procs.append(p)
    for p in procs:
        p.join(20)
        assert p.exitcode == 0
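
The observer's cross-rank agreement is conceptually an all-reduce of each rank's local extrema. A hand-rolled sketch of that idea, assuming the all_reduce_min/all_reduce_max collectives from megengine.distributed.functional (an illustration, not the observer's actual implementation):

from megengine.distributed.functional import all_reduce_max, all_reduce_min


def synced_min_max(y):
    # each rank contributes its local min/max; every rank receives the global ones
    return all_reduce_min(y.min()), all_reduce_max(y.max())
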
Example #14
def test_new_group():
    world_size = 3
    ranks = [2, 0]
    port = dist.get_free_ports(1)[0]
    server = dist.Server(port)

    def worker(rank):
        dist.init_process_group("localhost", port, world_size, rank, rank)
        if rank in ranks:
            group = dist.new_group(ranks)
            assert group.size == 2
            assert group.key == "2,0"
            assert group.rank == ranks.index(rank)
            assert group.comp_node == "gpu{}:2".format(rank)

    procs = []
    for rank in range(world_size):
        p = mp.Process(target=worker, args=(rank, ))
        p.start()
        procs.append(p)

    for p in procs:
        p.join(20)
        assert p.exitcode == 0
Example #15
def main():
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    parser = make_parser()
    args = parser.parse_args()
    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0],
                                   cfg.input_shape[1])
    save_dir = os.path.join(args.save_dir, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    args.ngpus = (dist.helper.get_device_count_by_fork("gpu")
                  if args.ngpus is None else args.ngpus)
    cfg.batch_size = cfg.batch_size if args.batch_size is None else args.batch_size

    dt_path = os.path.join(cfg.data_root, "person_detection_results",
                           args.dt_file)
    with open(dt_path, "r") as f:
        dets = json.load(f)

    gt_path = os.path.join(cfg.data_root, "annotations",
                           "person_keypoints_val2017.json")
    eval_gt = COCO(gt_path)
    gt = eval_gt.dataset

    dets = [
        i for i in dets
        if (i["image_id"] in eval_gt.imgs and i["category_id"] == 1)
    ]
    ann_file = {"images": gt["images"], "annotations": dets}

    if args.end_epoch == -1:
        args.end_epoch = args.start_epoch

    for epoch_num in range(args.start_epoch, args.end_epoch + 1,
                           args.test_freq):
        if args.model:
            model_file = args.model
        else:
            model_file = "{}/epoch_{}.pkl".format(args.model_dir, epoch_num)
        logger.info("Loading model: %s", model_file)

        all_results = list()

        result_queue = Queue(5000)
        procs = []

        master_ip = "localhost"
        # all workers must rendezvous on the same port, so allocate it and
        # start the server once, outside the per-GPU loop
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        for i in range(args.ngpus):
            proc = Process(
                target=worker,
                args=(
                    args.arch,
                    model_file,
                    cfg.data_root,
                    ann_file,
                    master_ip,
                    port,
                    i,
                    args.ngpus,
                    result_queue,
                ),
            )
            proc.start()
            procs.append(proc)

        for _ in tqdm(range(len(dets))):
            all_results.append(result_queue.get())
        for p in procs:
            p.join()

        json_name = "log-of-{}_epoch_{}.json".format(args.arch, epoch_num)
        json_path = os.path.join(save_dir, json_name)
        all_results = json.dumps(all_results)
        with open(json_path, "w") as fo:
            fo.write(all_results)
        logger.info("Save to %s finished, start evaluation!", json_path)

        eval_dt = eval_gt.loadRes(json_path)
        cocoEval = COCOeval(eval_gt, eval_dt, iouType="keypoints")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        metrics = [
            "AP",
            "AP@0.5",
            "AP@0.75",
            "APm",
            "APl",
            "AR",
            "AR@0.5",
            "AR@0.75",
            "ARm",
            "ARl",
        ]
        logger.info("mmAP".center(32, "-"))
        for i, m in enumerate(metrics):
            logger.info("|\t%s\t|\t%.03f\t|", m, cocoEval.stats[i])
        logger.info("-" * 32)
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-f",
                        "--file",
                        default="net.py",
                        type=str,
                        help="net description file")
    parser.add_argument(
        "-w",
        "--weight_file",
        default=None,
        type=str,
        help="weights file",
    )
    parser.add_argument(
        "-n",
        "--ngpus",
        default=1,
        type=int,
        help="total number of gpus for testing",
    )
    parser.add_argument(
        "-d",
        "--dataset_dir",
        type=str,
        default="/data/datasets",
    )
    args = parser.parse_args()

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()

    if args.ngpus > 1:
        master_ip = "localhost"
        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        result_list = []
        result_queue = Queue(500)
        procs = []
        for i in range(args.ngpus):
            proc = Process(
                target=worker,
                args=(
                    current_network,
                    args.weight_file,
                    args.dataset_dir,
                    master_ip,
                    port,
                    args.ngpus,
                    i,
                    result_queue,
                ),
            )
            proc.start()
            procs.append(proc)

        num_imgs = dict(VOC2012=1449, Cityscapes=500)

        for _ in tqdm(range(num_imgs[cfg.dataset])):
            result_list.append(result_queue.get())
        for p in procs:
            p.join()
    else:
        result_list = []

        worker(current_network, args.weight_file, args.dataset_dir, None, None,
               1, 0, result_list)

    if cfg.val_save_path is not None:
        save_results(result_list, cfg.val_save_path, cfg)
    logger.info("Start evaluation!")
    compute_metric(result_list, cfg)
Example #17
def main():
    # pylint: disable=import-outside-toplevel,too-many-branches,too-many-statements
    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    parser = make_parser()
    args = parser.parse_args()

    current_network = import_from_file(args.file)
    cfg = current_network.Cfg()

    # if args.weight_file:
    if not args.weight_dir:
        args.start_epoch = args.end_epoch = -1
    else:
        if args.start_epoch == -1:
            args.start_epoch = cfg.max_epoch - 1
        if args.end_epoch == -1:
            args.end_epoch = args.start_epoch
        assert 0 <= args.start_epoch <= args.end_epoch < cfg.max_epoch

    for epoch_num in range(args.start_epoch, args.end_epoch + 1):
        # if args.weight_file:
        #     weight_file = args.weight_file
        # else:
        #     weight_file = "log-of-{}/epoch_{}.pkl".format(
        #         os.path.basename(args.file).split(".")[0], epoch_num
        #     )
        if args.weight_dir:
            weight_dir = args.weight_dir
        else:
            weight_dir = "train_log/baseline"
        weight_file = os.path.join(weight_dir, "epoch_{}.pkl".format(epoch_num))

        if args.ngpus > 1:
            master_ip = "localhost"
            port = dist.get_free_ports(1)[0]
            dist.Server(port)

            result_list = []
            result_queue = Queue(2000)
            procs = []
            for i in range(args.ngpus):
                proc = Process(
                    target=worker,
                    args=(
                        current_network,
                        weight_file,
                        args.dataset_dir,
                        master_ip,
                        port,
                        args.ngpus,
                        i,
                        result_queue,
                    ),
                )
                proc.start()
                procs.append(proc)

            num_imgs = dict(coco=5000, cocomini=5000, objects365=30000)

            for _ in tqdm(range(num_imgs[cfg.test_dataset["name"]])):
                result_list.append(result_queue.get())
            for p in procs:
                p.join()
        else:
            result_list = []

            worker(
                current_network, weight_file, args.dataset_dir,
                None, None, 1, 0, result_list
            )

        total_time = sum([x["perf_time"] for x in result_list])
        average_time = total_time / len(result_list)
        fps = 1.0 / average_time
        logger.info(
            "average inference speed: {:.4}s / iter, fps: {:.3}".format(average_time, fps)
        )

        all_results = DetEvaluator.format(result_list, cfg)
        # json_path = "log-of-{}/epoch_{}.json".format(
        #     os.path.basename(args.file).split(".")[0], epoch_num
        # )
        json_path = os.path.join(weight_dir, "epoch_{}.json".format(epoch_num))

        all_results = json.dumps(all_results)

        with open(json_path, "w") as fo:
            fo.write(all_results)
        logger.info("Save to %s finished, start evaluation!", json_path)

        eval_gt = COCO(
            os.path.join(
                args.dataset_dir, cfg.test_dataset["name"], cfg.test_dataset["ann_file"]
            )
        )
        eval_dt = eval_gt.loadRes(json_path)
        cocoEval = COCOeval(eval_gt, eval_dt, iouType="bbox")
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        metrics = [
            "AP",
            "AP@0.5",
            "AP@0.75",
            "APs",
            "APm",
            "APl",
            "AR@1",
            "AR@10",
            "AR@100",
            "ARs",
            "ARm",
            "ARl",
        ]
        logger.info("mmAP".center(32, "-"))
        for i, m in enumerate(metrics):
            logger.info("|\t%s\t|\t%.03f\t|", m, cocoEval.stats[i])
        logger.info("-" * 32)