def main():
    parser = make_parser()
    args = parser.parse_args()

    # ------------------------ begin training -------------------------- #
    valid_nr_dev = mge.get_device_count("gpu")
    if args.ngpus == -1:
        world_size = valid_nr_dev
    else:
        if args.ngpus > valid_nr_dev:
            logger.error("do not have enough gpus for training")
            sys.exit(1)
        else:
            world_size = args.ngpus
    logger.info("Device Count = %d", world_size)

    log_dir = "log-of-{}".format(os.path.basename(args.file).split(".")[0])
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    if world_size > 1:
        mp.set_start_method("spawn")
        processes = list()
        for i in range(world_size):
            process = mp.Process(target=worker, args=(i, world_size, args))
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
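
# A minimal, runnable sketch (an assumption, not this repo's actual worker) of the
# `worker(rank, world_size, args)` entry point the launchers in this section dispatch
# to: each sub-process joins the process group using the same positional signature as
# the test workers below, then runs its share of the training. `_sketch_worker` and
# the hard-coded port 23456 are placeholders chosen for the sketch.
import megengine.distributed as dist


def _sketch_worker(rank, world_size, args):
    if world_size > 1:
        # (master_ip, port, world_size, rank, device) -- bind rank i to gpu i
        dist.init_process_group("localhost", 23456, world_size, rank, rank)
    # ... build the model, dataloader and optimizer, then run the training loop ...
    print("rank {}/{} ready on gpu{}".format(rank, world_size, rank))
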
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic

    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get('work_dir', None) is not None, \
            'if work_dir is not set in args, please set it in the config file'
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('training gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # scale learning rate by number of gpus
        is_dict_of_dict = True
        for _, cfg_ in cfg.optimizers.items():
            if not isinstance(cfg_, dict):
                is_dict_of_dict = False
        if is_dict_of_dict:
            for _, cfg_ in cfg.optimizers.items():
                cfg_['lr'] = cfg_['lr'] * world_size
        else:
            raise RuntimeError(
                "please use 'dict of dict' style for optimizers config")

        # start distributed training, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)
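
# The learning-rate scaling above only accepts an `optimizers` config written in
# "dict of dict" style. A hypothetical config illustrating the expected shape
# (the keys and values are made up for the example, not taken from this repo):
optimizers = dict(
    generator=dict(type='Adam', lr=1e-4, betas=(0.9, 0.999)),
    discriminator=dict(type='Adam', lr=4e-4, betas=(0.9, 0.999)),
)
# With 4 gpus the loop above rewrites each inner `lr` to `lr * 4`.
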
def worker(rank, data, expect, port):
    # `world_size` is a free variable, defined in the enclosing test scope
    if mge.get_device_count("gpu") < world_size:
        return
    dist.init_process_group("localhost", port, world_size, rank, rank)
    inp = tensor(data)
    output = broadcast(inp)
    assert np.allclose(output.numpy(), expect)
def main(): parser = argparse.ArgumentParser() parser.add_argument("-a", "--arch", default="shufflenet_v2_x1_0", type=str) parser.add_argument("-d", "--data", default=None, type=str) parser.add_argument("-m", "--model", default=None, type=str) parser.add_argument("-n", "--ngpus", default=None, type=int) parser.add_argument("-w", "--workers", default=4, type=int) parser.add_argument("--report-freq", default=50, type=int) args = parser.parse_args() world_size = mge.get_device_count( "gpu") if args.ngpus is None else args.ngpus if world_size > 1: # start distributed training, dispatch sub-processes mp.set_start_method("spawn") processes = [] for rank in range(world_size): p = mp.Process(target=worker, args=(rank, world_size, args)) p.start() processes.append(p) for p in processes: p.join() else: worker(0, 1, args)
def train(args):
    # ------------------------ begin training -------------------------- #
    valid_nr_dev = mge.get_device_count("gpu")
    gpu_num = min(valid_nr_dev, args.num_gpus)
    assert gpu_num > 0
    logger.info('Device Count: {}'.format(gpu_num))

    ensure_dir(cfg.model_dir)
    if not osp.exists('output'):
        os.symlink(cfg.output_dir, 'output')

    if gpu_num > 1:
        args.port = find_free_port()
        mp.set_start_method("spawn")
        processes = list()
        for i in range(gpu_num):
            process = mp.Process(target=worker, args=(i, gpu_num, args))
            process.start()
            processes.append(process)

        for p in processes:
            p.join()
    else:
        worker(0, 1, args)
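
# `find_free_port` is not defined in the snippet above; a common implementation
# (an assumption, not necessarily the one used in this repo) asks the OS for an
# ephemeral port by binding a throwaway socket:
import socket


def find_free_port():
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))  # port 0 lets the OS pick any free port
        return s.getsockname()[1]
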
def worker(rank, data, backend, expect, port_queue): if mge.get_device_count("gpu") < world_size: return _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) inp = Parameter(data) dist.functional.bcast_param(inp) assert np.allclose(inp.numpy(), expect)
def worker(rank, data, backend, expect, port_queue): if mge.get_device_count("gpu") < world_size: return _init_process_group_wrapper(world_size, rank, rank, backend, port_queue) inp = tensor(data) output = dist.functional.all_reduce_min(inp) assert np.allclose(output.numpy(), expect)
def train(args): # ------------------------ begin training -------------------------- # valid_nr_dev = mge.get_device_count("gpu") if args.divice_num == -1: gpu_num = valid_nr_dev else: if args.divice_num > valid_nr_dev: logger.error("do not have enough gpus for training") sys.exit(1) else: gpu_num = args.divice_num logger.info("Device Count = %d", gpu_num) model_dir = cfg.model_dir if not os.path.isdir(model_dir): os.makedirs(model_dir) if gpu_num > 1: mp.set_start_method("spawn") processes = list() for i in range(gpu_num): process = mp.Process(target=worker, args=(i, gpu_num, args)) process.start() processes.append(process) for p in processes: p.join() else: worker(0, 1, args)
def main(): parser = argparse.ArgumentParser() parser.add_argument( "-a", "--arch", default="simplebaseline_res50", type=str, choices=[ "simplebaseline_res50", "simplebaseline_res101", "simplebaseline_res152", ], ) parser.add_argument("--pretrained", default=True, type=bool) parser.add_argument("-s", "--save", default="/data/models", type=str) parser.add_argument("--data_root", default="/data/coco/images/", type=str) parser.add_argument( "--ann_file", default="/data/coco/annotations/person_keypoints_train2017.json", type=str, ) parser.add_argument("--continue", default=None, type=str) parser.add_argument("-b", "--batch_size", default=64, type=int) parser.add_argument("--lr", default=6e-4, type=float) parser.add_argument("--epochs", default=200, type=int) parser.add_argument("--multi_scale_supervision", default=True, type=bool) parser.add_argument("-n", "--ngpus", default=8, type=int) parser.add_argument("-w", "--workers", default=8, type=int) parser.add_argument("--report-freq", default=10, type=int) args = parser.parse_args() model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1]) save_dir = os.path.join(args.save, model_name) if not os.path.exists(save_dir): os.makedirs(save_dir) mge.set_log_file(os.path.join(save_dir, "log.txt")) world_size = mge.get_device_count( "gpu") if args.ngpus is None else args.ngpus if world_size > 1: # scale learning rate by number of gpus args.lr *= world_size # start distributed training, dispatch sub-processes processes = [] for rank in range(world_size): p = mp.Process(target=worker, args=(rank, world_size, args)) p.start() processes.append(p) for p in processes: p.join() else: worker(0, 1, args)
def main(): parser = argparse.ArgumentParser() parser.add_argument( "-a", "--arch", default="simplebaseline_res50", type=str, choices=cfg.model_choices, ) parser.add_argument("-s", "--save", default="/data/models", type=str) parser.add_argument("-b", "--batch_size", default=32, type=int) parser.add_argument("-lr", "--initial_lr", default=3e-4, type=float) parser.add_argument("--resume", default=None, type=str) parser.add_argument("--multi_scale_supervision", action="store_true") parser.add_argument("-n", "--ngpus", default=8, type=int) parser.add_argument("-w", "--workers", default=8, type=int) args = parser.parse_args() model_name = "{}_{}x{}".format(args.arch, cfg.input_shape[0], cfg.input_shape[1]) save_dir = os.path.join(args.save, model_name) if not os.path.exists(save_dir): os.makedirs(save_dir) mge.set_log_file(os.path.join(save_dir, "log.txt")) if args.batch_size != cfg.batch_size: cfg.batch_size = args.batch_size if args.initial_lr != cfg.initial_lr: cfg.initial_lr = args.initial_lr world_size = mge.get_device_count("gpu") if args.ngpus is None else args.ngpus if world_size > 1: # scale learning rate by number of gpus master_ip = "localhost" port = dist.get_free_ports(1)[0] dist.Server(port) cfg.weight_decay *= world_size # start distributed training, dispatch sub-processes processes = [] for rank in range(world_size): p = mp.Process( target=worker, args=(master_ip, port, rank, world_size, args) ) p.start() processes.append(p) for p in processes: p.join() else: worker(None, None, 0, 1, args)
def worker(rank, data, yv_expect, running_mean, running_var): if mge.get_device_count("gpu") < nr_ranks: return dist.init_process_group("localhost", port, nr_ranks, rank, rank) bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps) for i in range(steps): yv = bn(Tensor(data[i])) _assert_allclose(yv.numpy(), yv_expect) _assert_allclose(bn.running_mean.numpy(), running_mean) _assert_allclose(bn.running_var.numpy(), running_var)
def main(): parser = argparse.ArgumentParser() parser.add_argument( "-a", "--arch", default="resnet50", type=str, choices=[ "resnet18", "resnet34", "resnet50", "resnet101", "resnet152", "resnext50_32x4d", "resnext101_32x8d", ], ) parser.add_argument("-d", "--data", default=None, type=str) parser.add_argument("-s", "--save", default="/data/models", type=str) parser.add_argument("-b", "--batch-size", default=32, type=int) parser.add_argument("--learning-rate", default=0.0125, type=float) parser.add_argument("--momentum", default=0.9, type=float) parser.add_argument("--weight-decay", default=1e-4, type=float) parser.add_argument("--epochs", default=90, type=int) parser.add_argument("-n", "--ngpus", default=None, type=int) parser.add_argument("-w", "--workers", default=4, type=int) parser.add_argument("--report-freq", default=50, type=int) args = parser.parse_args() save_dir = os.path.join(args.save, args.arch) if not os.path.exists(save_dir): os.makedirs(save_dir) mge.set_log_file(os.path.join(save_dir, "log.txt")) world_size = mge.get_device_count( "gpu") if args.ngpus is None else args.ngpus if world_size > 1: # scale learning rate by number of gpus args.learning_rate *= world_size # start distributed training, dispatch sub-processes mp.set_start_method("spawn") processes = [] for rank in range(world_size): p = mp.Process(target=worker, args=(rank, world_size, args)) p.start() processes.append(p) for p in processes: p.join() else: worker(0, 1, args)
def worker(rank): if mge.get_device_count("gpu") < world_size: return if rank == 0: # remote send dist.init_process_group("localhost", port, world_size, rank, rank) x = Tensor(val, device="gpu0") y = remote_send(x, 1) assert y.numpy()[0] == 0 else: # remote recv dist.init_process_group("localhost", port, world_size, rank, rank) y = remote_recv(0, val.shape, val.dtype) assert y.device == "gpu1" np.testing.assert_almost_equal(val, y.numpy())
def worker(rank, data, yv_expect, running_mean, running_var): if mge.get_device_count("gpu") < nr_ranks: return dist.init_process_group("localhost", 2333, nr_ranks, rank, rank) bn = SyncBatchNorm(nr_chan, momentum=momentum, eps=eps) data_tensor = tensor() for i in range(steps): data_tensor.set_value(data[i]) yv = bn(data_tensor) assertTensorClose(yv_expect, yv.numpy(), max_err=5e-6) assertTensorClose(running_mean, bn.running_mean.numpy(), max_err=5e-6) assertTensorClose(running_var, bn.running_var.numpy(), max_err=5e-6)
def main(): parser = argparse.ArgumentParser() parser.add_argument("-a", "--arch", default="shufflenet_v2_x0_5", type=str) parser.add_argument("-d", "--data", default=None, type=str) parser.add_argument("-s", "--save", default="./models", type=str) parser.add_argument("-m", "--model", default=None, type=str) parser.add_argument('-o', '--output', type=str, required=True, help='set path for checkpoints \w tensorboard') parser.add_argument("-b", "--batch-size", default=128, type=int) parser.add_argument("--learning-rate", default=0.0625, type=float) parser.add_argument("--momentum", default=0.9, type=float) parser.add_argument("--weight-decay", default=4e-5, type=float) parser.add_argument("--steps", default=300000, type=int) parser.add_argument("-n", "--ngpus", default=None, type=int) parser.add_argument("-w", "--workers", default=4, type=int) parser.add_argument("--report-freq", default=50, type=int) args = parser.parse_args() world_size = mge.get_device_count( "gpu") if args.ngpus is None else args.ngpus save_dir = os.path.join(args.save, args.arch, "b{}".format(args.batch_size * world_size)) if not os.path.exists(save_dir): os.makedirs(save_dir) mge.set_log_file(os.path.join(save_dir, "log.txt")) if not os.path.exists(args.output): os.makedirs(args.output) if world_size > 1: # scale learning rate by number of gpus args.learning_rate *= world_size # start distributed training, dispatch sub-processes mp.set_start_method("spawn") processes = [] for rank in range(world_size): p = mp.Process(target=worker, args=(rank, world_size, args)) p.start() processes.append(p) for p in processes: p.join() else: worker(0, 1, args)
def main(): parser = argparse.ArgumentParser() parser.add_argument("-a", "--arch", default="resnet18", type=str) parser.add_argument("-d", "--data", default=None, type=str) parser.add_argument("-s", "--save", default="/data/models", type=str) parser.add_argument("-c", "--checkpoint", default=None, type=str, help="pretrained model to finetune") parser.add_argument( "-m", "--mode", default="qat", type=str, choices=["normal", "qat", "quantized"], help="Quantization Mode\n" "normal: no quantization, using float32\n" "qat: quantization aware training, simulate int8\n" "quantized: convert mode to int8 quantized, inference only") parser.add_argument("-n", "--ngpus", default=None, type=int) parser.add_argument("-w", "--workers", default=4, type=int) parser.add_argument("--report-freq", default=50, type=int) args = parser.parse_args() world_size = mge.get_device_count( "gpu") if args.ngpus is None else args.ngpus if args.mode == "quantized": world_size = 1 args.report_freq = 1 # test is slow on cpu mge.set_default_device("cpux") logger.warning("quantized mode use cpu only") if world_size > 1: # start distributed training, dispatch sub-processes mp.set_start_method("spawn") processes = [] for rank in range(world_size): p = mp.Process(target=worker, args=(rank, world_size, args)) p.start() processes.append(p) for p in processes: p.join() else: worker(0, 1, args)
def main():
    timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
    args = parse_args()
    cfg = Config.fromfile(args.config)
    cfg.gpus = args.gpus
    cfg.dynamic = args.dynamic
    cfg.ensemble = args.ensemble

    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    else:
        assert cfg.get('work_dir', None) is not None, \
            'if work_dir is not set in args, please set it in the config file'
    cfg.work_dir = os.path.join(cfg.work_dir, timestamp)
    mkdir_or_exist(os.path.abspath(cfg.work_dir))

    # init the logger
    log_file = os.path.join(cfg.work_dir, 'root.log')
    logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)

    # log some basic info
    logger.info('test gpus num: {}'.format(args.gpus))
    logger.info('Config:\n{}'.format(cfg.text))

    # get world_size
    world_size = args.gpus
    assert world_size <= mge.get_device_count("gpu")
    if world_size == 0:  # use cpu
        mge.set_default_device(device='cpux')
    else:
        gpuid = args.gpuid
        mge.set_default_device(device='gpu' + gpuid)

    if world_size > 1:
        # start distributed test, dispatch sub-processes
        mp.set_start_method("spawn")
        processes = []
        for rank in range(world_size):
            p = mp.Process(target=worker, args=(rank, world_size, cfg))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        worker(0, 1, cfg)