Example #1
def main():
    parser = make_parser()
    args = parser.parse_args()

    # ------------------------ begin training -------------------------- #
    valid_nr_dev = mge.get_device_count("gpu")
    if args.ngpus == -1:
        world_size = valid_nr_dev
    else:
        if args.ngpus > valid_nr_dev:
            logger.error("not enough GPUs available for training")
            sys.exit(1)
        else:
            world_size = args.ngpus

    logger.info("Device Count = %d", world_size)

    log_dir = "log-of-{}".format(os.path.basename(args.file).split(".")[0])
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    if world_size > 1:
        mp.set_start_method("spawn")
        processes = list()
        for i in range(world_size):
            process = mp.Process(target=worker, args=(i, world_size, args))
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
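Every launcher in this listing dispatches to a worker function defined elsewhere in the same script. As a point of reference, a minimal hypothetical sketch of such a worker is shown below, modeled on the dist.init_process_group(master_ip, port, world_size, rank, dev) calls that appear in the test fragments later in the listing; the port value and the training-loop body are placeholders, not taken from any of the original sources.

import megengine as mge
import megengine.distributed as dist

def worker(rank, world_size, args):
    # join the process group before any collective op runs;
    # 23333 is a placeholder port, the real scripts pick their own
    if world_size > 1:
        dist.init_process_group("localhost", 23333, world_size, rank, rank)
    # ... build the model, data loader and optimizer here, then run
    # the training loop; this part differs from example to example ...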
Example #2
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir must be set either via args or in the config file'
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('training gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # scale learning rate by number of gpus
        is_dict_of_dict = True
        for _, cfg_ in cfg.optimizers.items():
            if not isinstance(cfg_, dict):
                is_dict_of_dict = False
        if is_dict_of_dict:
            for _, cfg_ in cfg.optimizers.items():
                cfg_['lr'] = cfg_['lr'] * world_size
        else:
            raise RuntimeError(
                "please use 'dict of dict' style for optimizers config")

        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)
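The learning-rate scaling above (also in Examples #9, #12 and #15) is the usual linear scaling rule for data-parallel training: the effective batch size grows with the number of GPUs, so the base learning rate is multiplied by world_size to compensate.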
Example #3
def worker(rank, data, expect, port):
    if mge.get_device_count("gpu") < world_size:
        return
    dist.init_process_group("localhost", port, world_size, rank, rank)
    inp = tensor(data)
    output = broadcast(inp)
    assert np.allclose(output.numpy(), expect)
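Note: the worker fragments in Examples #3, #6, #7, #11, #13 and #14 come from test files where worker is nested inside a test function; names that look undefined here (world_size, port, val, nr_ranks, nr_chan, momentum, eps, steps) are closure variables of the enclosing test.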
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--arch", default="shufflenet_v2_x1_0", type=str)
    parser.add_argument("-d", "--data", default=None, type=str)
    parser.add_argument("-m", "--model", default=None, type=str)

    parser.add_argument("-n", "--ngpus", default=None, type=int)
    parser.add_argument("-w", "--workers", default=4, type=int)
    parser.add_argument("--report-freq", default=50, type=int)
    args = parser.parse_args()

    world_size = mge.get_device_count(
        "gpu") if args.ngpus is None else args.ngpus

    if world_size > 1:
        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, args))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
Example #5
def train(args):
    # ------------------------ begin training -------------------------- #
    valid_nr_dev = mge.get_device_count("gpu")
    gpu_num = min(valid_nr_dev, args.num_gpus)
    assert gpu_num > 0
    logger.info('Device Count: {}'.format(gpu_num))

    ensure_dir(cfg.model_dir)

    if not osp.exists('output'):
        os.symlink(cfg.output_dir, 'output')

    if gpu_num > 1:
        args.port = find_free_port()
        mp.set_start_method("spawn")
        processes = list()
        for i in range(gpu_num):
            process = mp.Process(target=worker, args=(i, gpu_num, args))
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
Example #6
def worker(rank, data, backend, expect, port_queue):
    if mge.get_device_count("gpu") < world_size:
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
    inp = Parameter(data)
    dist.functional.bcast_param(inp)
    assert np.allclose(inp.numpy(), expect)
Example #7
def worker(rank, data, backend, expect, port_queue):
    if mge.get_device_count("gpu") < world_size:
        return
    _init_process_group_wrapper(world_size, rank, rank, backend, port_queue)
    inp = tensor(data)
    output = dist.functional.all_reduce_min(inp)
    assert np.allclose(output.numpy(), expect)
Example #8
def train(args):
    # ------------------------ begin training -------------------------- #
    valid_nr_dev = mge.get_device_count("gpu")
    if args.divice_num == -1:
        gpu_num = valid_nr_dev
    else:
        if args.divice_num > valid_nr_dev:
            logger.error("not enough GPUs available for training")
            sys.exit(1)
        else:
            gpu_num = args.divice_num

    logger.info("Device Count = %d", gpu_num)

    model_dir = cfg.model_dir
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    if gpu_num > 1:
        mp.set_start_method("spawn")
        processes = list()
        for i in range(gpu_num):
            process = mp.Process(target=worker, args=(i, gpu_num, args))
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        default="simplebaseline_res50",
        type=str,
        choices=[
            "simplebaseline_res50",
            "simplebaseline_res101",
            "simplebaseline_res152",
        ],
    )
    parser.add_argument("--pretrained", default=True, type=bool)
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("--data_root", default="/data/coco/images/", type=str)
    parser.add_argument(
        "--ann_file",
        default="/data/coco/annotations/person_keypoints_train2017.json",
        type=str,
    )
    parser.add_argument("--continue", default=None, type=str)

    parser.add_argument("-b", "--batch_size", default=64, type=int)
    parser.add_argument("--lr", default=6e-4, type=float)
    parser.add_argument("--epochs", default=200, type=int)

    parser.add_argument("--multi_scale_supervision", default=True, type=bool)

    parser.add_argument("-n", "--ngpus", default=8, type=int)
    parser.add_argument("-w", "--workers", default=8, type=int)
    parser.add_argument("--report-freq", default=10, type=int)

    args = parser.parse_args()

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0],
                                   cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    world_size = mge.get_device_count(
        "gpu") if args.ngpus is None else args.ngpus

    if world_size > 1:
        # scale learning rate by number of gpus
        args.lr *= world_size
        # start distributed training, dispatch sub-processes
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, args))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        default="simplebaseline_res50",
        type=str,
        choices=cfg.model_choices,
    )
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("-b", "--batch_size", default=32, type=int)
    parser.add_argument("-lr", "--initial_lr", default=3e-4, type=float)

    parser.add_argument("--resume", default=None, type=str)

    parser.add_argument("--multi_scale_supervision", action="store_true")

    parser.add_argument("-n", "--ngpus", default=8, type=int)
    parser.add_argument("-w", "--workers", default=8, type=int)

    args = parser.parse_args()

    model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1])
    save_dir = os.path.join(args.save, model_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    if args.batch_size != cfg.batch_size:
        cfg.batch_size = args.batch_size
    if args.initial_lr != cfg.initial_lr:
        cfg.initial_lr = args.initial_lr

    world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus

    if world_size > 1:
        master_ip = "localhost"

        port = dist.get_free_ports(1)[0]
        dist.Server(port)

        # scale weight decay by number of gpus
        cfg.weight_decay *= world_size
        # start distributed training, dispatch sub-processes
        processes = []
        for rank in range(world_size):
            p = mp.Process(
                target=worker, args=(master_ip, port, rank, world_size, args)
            )
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(None, None, 0, 1, args)
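Example #10 is the only launcher here that reserves a rendezvous port up front: the parent calls dist.get_free_ports(1)[0], starts a dist.Server on that port, and hands master_ip and port to every worker (passing (None, None, 0, 1, args) in the single-device case). The worker itself is not shown; below is a minimal sketch under the assumption that it simply forwards those values to dist.init_process_group.

import megengine.distributed as dist

def worker(master_ip, port, rank, world_size, args):
    # hypothetical five-argument worker matching Example #10's dispatch;
    # skip group setup in the single-process case, where master_ip is None
    if world_size > 1:
        dist.init_process_group(master_ip, port, world_size, rank, rank)
    # ... training loop ...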
Example #11
def worker(rank, data, yv_expect, running_mean, running_var):
    if mge.get_device_count("gpu") < nr_ranks:
        return
    dist.init_process_group("localhost", port, nr_ranks, rank, rank)
    bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
    for i in range(steps):
        yv = bn(Tensor(data[i]))

    _assert_allclose(yv.numpy(), yv_expect)
    _assert_allclose(bn.running_mean.numpy(), running_mean)
    _assert_allclose(bn.running_var.numpy(), running_var)
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-a",
        "--arch",
        default="resnet50",
        type=str,
        choices=[
            "resnet18",
            "resnet34",
            "resnet50",
            "resnet101",
            "resnet152",
            "resnext50_32x4d",
            "resnext101_32x8d",
        ],
    )
    parser.add_argument("-d", "--data", default=None, type=str)
    parser.add_argument("-s", "--save", default="/data/models", type=str)

    parser.add_argument("-b", "--batch-size", default=32, type=int)
    parser.add_argument("--learning-rate", default=0.0125, type=float)
    parser.add_argument("--momentum", default=0.9, type=float)
    parser.add_argument("--weight-decay", default=1e-4, type=float)
    parser.add_argument("--epochs", default=90, type=int)

    parser.add_argument("-n", "--ngpus", default=None, type=int)
    parser.add_argument("-w", "--workers", default=4, type=int)
    parser.add_argument("--report-freq", default=50, type=int)
    args = parser.parse_args()

    save_dir = os.path.join(args.save, args.arch)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    world_size = mge.get_device_count(
        "gpu") if args.ngpus is None else args.ngpus

    if world_size > 1:
        # scale learning rate by number of gpus
        args.learning_rate *= world_size
        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, args))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
Example #13
def worker(rank):
    if mge.get_device_count("gpu") < world_size:
        return
    if rank == 0:  # remote send
        dist.init_process_group("localhost", port, world_size, rank, rank)
        x = Tensor(val, device="gpu0")
        y = remote_send(x, 1)
        assert y.numpy()[0] == 0
    else:  # remote recv
        dist.init_process_group("localhost", port, world_size, rank, rank)
        y = remote_recv(0, val.shape, val.dtype)
        assert y.device == "gpu1"
        np.testing.assert_almost_equal(val, y.numpy())
Example #14
def worker(rank, data, yv_expect, running_mean, running_var):
    if mge.get_device_count("gpu") < nr_ranks:
        return
    dist.init_process_group("localhost", 2333, nr_ranks, rank, rank)
    bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps)
    data_tensor = tensor()
    for i in range(steps):
        data_tensor.set_value(data[i])
        yv = bn(data_tensor)

    assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6)
    assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6)
    assertTensorClose(running_var, bn.running_var.numpy(), max_err=5e-6)
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--arch", default="shufflenet_v2_x0_5", type=str)
    parser.add_argument("-d", "--data", default=None, type=str)
    parser.add_argument("-s", "--save", default="./models", type=str)
    parser.add_argument("-m", "--model", default=None, type=str)
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=True,
                        help='set path for checkpoints with tensorboard')

    parser.add_argument("-b", "--batch-size", default=128, type=int)
    parser.add_argument("--learning-rate", default=0.0625, type=float)
    parser.add_argument("--momentum", default=0.9, type=float)
    parser.add_argument("--weight-decay", default=4e-5, type=float)
    parser.add_argument("--steps", default=300000, type=int)

    parser.add_argument("-n", "--ngpus", default=None, type=int)
    parser.add_argument("-w", "--workers", default=4, type=int)
    parser.add_argument("--report-freq", default=50, type=int)
    args = parser.parse_args()

    world_size = mge.get_device_count(
        "gpu") if args.ngpus is None else args.ngpus

    save_dir = os.path.join(args.save, args.arch,
                            "b{}".format(args.batch_size * world_size))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    mge.set_log_file(os.path.join(save_dir, "log.txt"))

    if not os.path.exists(args.output):
        os.makedirs(args.output)

    if world_size > 1:
        # scale learning rate by number of gpus
        args.learning_rate *= world_size
        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, args))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--arch", default="resnet18", type=str)
    parser.add_argument("-d", "--data", default=None, type=str)
    parser.add_argument("-s", "--save", default="/data/models", type=str)
    parser.add_argument("-c",
                        "--checkpoint",
                        default=None,
                        type=str,
                        help="pretrained model to finetune")

    parser.add_argument(
        "-m",
        "--mode",
        default="qat",
        type=str,
        choices=["normal", "qat", "quantized"],
        help="Quantization Mode\n"
        "normal: no quantization, using float32\n"
        "qat: quantization aware training, simulate int8\n"
        "quantized: convert mode to int8 quantized, inference only")

    parser.add_argument("-n", "--ngpus", default=None, type=int)
    parser.add_argument("-w", "--workers", default=4, type=int)
    parser.add_argument("--report-freq", default=50, type=int)
    args = parser.parse_args()

    world_size = mge.get_device_count(
        "gpu") if args.ngpus is None else args.ngpus

    if args.mode == "quantized":
        world_size = 1
        args.report_freq = 1  # test is slow on cpu
        mge.set_default_device("cpux")
        logger.warning("quantized mode uses cpu only")

    if world_size > 1:
        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, args))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
Example #17
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get(
            'work_dir', None
        ) is not None, 'work_dir must be set either via args or in the config file'

    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('test gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # start distributed test, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)