def _setup_test_infra(world_rank, world_size):
    """distributed setup just for testing purposes"""
    # Emulate the environment torch.distributed.launch would provide so the
    # NCCL process group can rendezvous on localhost.
    rendezvous_env = {
        'RANK': str(world_rank),
        'WORLD_SIZE': str(world_size),
        'MASTER_ADDR': '127.0.0.1',
        'MASTER_PORT': '29500',
    }
    for key, value in rendezvous_env.items():
        os.environ[key] = value
    # Pin this process to its GPU before initializing the process group.
    set_cuda_device_id(world_rank)
    dist.init_process_group(backend='nccl', world_size=world_size, rank=world_rank)
def _setup_test_infra(world_rank, world_size):
    """distributed setup just for testing purposes"""
    # Set the env vars torch.distributed.launch would normally provide so
    # init_process_group can rendezvous on localhost.
    os.environ["RANK"] = str(world_rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    # Pin this process to its GPU before NCCL initialization.
    set_cuda_device_id(world_rank)
    dist.init_process_group(backend="nccl", world_size=world_size, rank=world_rank)
def to_ort_model(self, model, config, args):
    """Wrap a GPT-2 style torch model in an ORTTrainer for training with ORT.

    Builds the ONNX model description from the GPT-2 config, sets up per-parameter
    Adam optimizer attributes (weight decay excluded for norm/bias parameters),
    pins ORT to this process's GPU, and returns the constructed ORTTrainer.
    """
    model_desc = self.gpt2_model_description(config.n_head, config.vocab_size, config.n_embd,
                                             config.n_layer, config.n_ctx,
                                             args.per_gpu_train_batch_size)
    learning_rate_description = self.ort_trainer_learning_rate_description()

    def map_optimizer_attributes(name):
        # Parameters whose name contains any of these keys are excluded from
        # weight decay (lambda = 0.0); all others use args.weight_decay.
        no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
        no_decay = False
        for no_decay_key in no_decay_keys:
            if no_decay_key in name:
                no_decay = True
                break
        if no_decay:
            return {"alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": args.adam_epsilon}
        else:
            return {"alpha": 0.9, "beta": 0.999, "lambda": args.weight_decay, "epsilon": args.adam_epsilon}

    from onnxruntime.capi._pybind_state import set_cuda_device_id, set_arena_extend_strategy, ArenaExtendStrategy
    set_arena_extend_strategy(ArenaExtendStrategy.kSameAsRequested)
    # NOTE(review): batch/optimizer settings come from the `args` parameter while
    # distributed settings come from self.args — confirm these are the same
    # configuration object.
    set_cuda_device_id(self.args.local_rank)
    model = ORTTrainer(model, None, model_desc, "AdamOptimizer",
                       map_optimizer_attributes,
                       learning_rate_description,
                       args.device,
                       gradient_accumulation_steps=args.gradient_accumulation_steps,
                       world_rank=self.args.world_rank,
                       world_size=self.args.world_size,
                       use_mixed_precision=self.args.fp16,
                       allreduce_post_accumulation=True,
                       _opset_version=12)
    logger.info("****************************Model converted to ORT")
    return model
def setup_onnxruntime_with_mpi(args):
    """Derive rank info from MPI, pin torch and ORT to the rank's GPU.

    Populates args.local_rank / world_rank / world_size / n_gpu in place and
    returns the selected torch CUDA device.
    """
    from mpi4py import MPI
    from onnxruntime.capi._pybind_state import set_cuda_device_id

    rank = MPI.COMM_WORLD.Get_rank()
    args.local_rank = rank
    args.world_rank = rank
    args.world_size = MPI.COMM_WORLD.Get_size()
    args.n_gpu = 1

    torch.cuda.set_device(rank)
    set_cuda_device_id(rank)
    return torch.device("cuda", rank)
def create_ort_training_session_bind_parameters(model, device, world_rank=-1, world_size=1,
                                                gradient_accumulation_steps=1):
    """Create an ORT TrainingSession with model weights held as torch tensors.

    Moves every graph initializer into a torch.nn.Parameter on *device*,
    re-declares it as a graph input, and io-binds the parameter (and its
    gradient / gradient-accumulation buffer) into the training and eval
    io_bindings so ORT reads/writes the torch-owned memory directly.

    Returns (session, train_io_binding, eval_io_binding, output_name,
    torch_params, output_types).
    """
    output_name = model.graph.output[0].name
    ort_parameters = ort.TrainingParameters()
    ort_parameters.loss_output_name = output_name
    ort_parameters.use_mixed_precision = False
    ort_parameters.world_rank = world_rank
    ort_parameters.world_size = world_size
    ort_parameters.gradient_accumulation_steps = gradient_accumulation_steps

    torch_params = {}
    output_types = {}
    for output in model.graph.output:
        output_types[output.name] = output.type.tensor_type

    # Convert each initializer to a torch Parameter and re-declare it as a
    # graph input so ORT treats it as a trainable, externally-provided weight.
    for initializer in model.graph.initializer:
        torch_tensor = torch.nn.Parameter(torch.as_tensor(numpy_helper.to_array(initializer), device=device))
        delete_input_with_name(model.graph.input, initializer.name)
        model.graph.input.extend(
            [helper.make_tensor_value_info(initializer.name, initializer.data_type, initializer.dims)])
        torch_params[initializer.name] = torch_tensor

    # All initializers now live in torch_params; drop them from the graph.
    del model.graph.initializer[:]
    ort_parameters.weights_to_train = set(torch_params.keys())

    if device.type == 'cuda' and hasattr(device, "index") and device.index is not None:
        from onnxruntime.capi._pybind_state import set_cuda_device_id
        set_cuda_device_id(device.index)

    session = ort.TrainingSession(model.SerializeToString(), ort_parameters)

    train_io_binding = session.io_binding()
    eval_io_binding = session.io_binding()
    enable_grad_accumulation = gradient_accumulation_steps > 1
    for param in torch_params.keys():
        torch_tensor = torch_params[param]
        # Bind the same torch-owned buffer into both bindings (zero-copy).
        train_io_binding.bind_input(param, torch_tensor.device.type, get_device_index(torch_tensor.device),
                                    dtype_torch_to_numpy(torch_params[param].dtype), list(torch_tensor.size()),
                                    torch_tensor.data_ptr())
        eval_io_binding.bind_input(param, torch_tensor.device.type, get_device_index(torch_tensor.device),
                                   dtype_torch_to_numpy(torch_params[param].dtype), list(torch_tensor.size()),
                                   torch_tensor.data_ptr())
        device_index = get_device_index(device)
        # Per-parameter gradient (or accumulation) buffer for the train binding.
        create_and_bind_grad_or_grad_accumulate_buffer(train_io_binding,
                                                       torch_tensor, param, enable_grad_accumulation,
                                                       device, device_index)
    return session, train_io_binding, eval_io_binding, output_name, torch_params, output_types
def setup_onnxruntime_with_mpi(args): ''' from mpi4py import MPI comm = MPI.COMM_WORLD has_aml = 'AZ_BATCH_MASTER_NODE' in os.environ.keys() or 'AZ_BATCHAI_MPI_MASTER_NODE' in os.environ.keys() if not has_aml: print('Detected local run') args.local_rank = comm.Get_rank() % torch.cuda.device_count() args.world_rank = comm.Get_rank() args.world_size = comm.Get_size() torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) args.n_gpu = 1 else: print('Detected Azure batch run') set_environment_variables_for_nccl_backend(get_local_size() == get_global_size(), IB = args.use_ib) args.local_rank = get_local_rank() args.local_size = get_local_size() args.world_rank = get_world_rank() args.world_size = get_global_size() print('Local rank: {}'.format(args.local_rank)) print('Local size: {}'.format(args.local_size)) print('World rank: {}'.format(args.world_rank)) print('World size: {}'.format(args.world_size)) print('CUDA device: {}'.format(args.local_rank)) torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) args.n_gpu = 1 torch.distributed.init_process_group(backend='nccl') ''' #device = torch.device("cuda", get_local_rank()) device = torch.device("cuda", args.distributed_rank) from onnxruntime.capi._pybind_state import set_cuda_device_id #set_cuda_device_id(get_local_rank()) set_cuda_device_id(args.distributed_rank) from onnxruntime.capi._pybind_state import set_arena_extend_strategy, ArenaExtendStrategy set_arena_extend_strategy(ArenaExtendStrategy.kSameAsRequested) return device
def test_single_precision_adasum_on_gpu():
    """Run the Adasum reduction tests in single precision on this rank's GPU."""
    world_rank = get_mpi_context_world_rank()
    world_size = get_mpi_context_world_size()
    set_cuda_device_id(world_rank)

    trainer_options = {
        "debug": {"deterministic_compute": True},
        "device": {"id": "cuda:" + str(world_rank)},
        "distributed": {
            "world_rank": world_rank,
            "world_size": world_size,
            "enable_adasum": True,
        },
    }
    _run_adasum_tests(orttrainer.ORTTrainerOptions(trainer_options))
def test_single_precision_adasum_on_gpu():
    """Run the Adasum reduction tests in single precision on this rank's GPU."""
    # Common setup
    world_rank = get_mpi_context_world_rank()
    world_size = get_mpi_context_world_size()
    # Pin this process to its GPU before building trainer options.
    set_cuda_device_id(world_rank)
    device = 'cuda:' + str(world_rank)
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
        'distributed': {
            'world_rank': world_rank,
            'world_size': world_size,
            'enable_adasum': True,
        }
    })
    _run_adasum_tests(opts)
    return results
# NOTE(review): the `return results` above is the tail of a method whose
# definition lies before this chunk — reproduced unchanged.


# Script entry point: when launched under mpirun with more than one process,
# emulate torch.distributed.launch's environment and run the distributed
# MRPC test directly; otherwise fall back to the normal unittest runner.
if __name__ == "__main__":
    local_rank = get_mpi_context_local_rank()
    world_size = get_mpi_context_world_size()
    if world_size > 1:
        # mpi launch
        logger.warning("mpirun launch, local_rank / world_size: %s : % s", local_rank, world_size)
        # TrainingArguments._setup_devices will call torch.distributed.init_process_group(backend="nccl")
        # pytorch expects following environment settings (which would be set if launched with torch.distributed.launch).
        os.environ['RANK'] = str(local_rank)
        os.environ['WORLD_SIZE'] = str(world_size)
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = '29500'

        from onnxruntime.capi._pybind_state import set_cuda_device_id
        # Pin this process to its GPU before any distributed init happens.
        set_cuda_device_id(local_rank)

        test = ORTGlueTest()
        test.setUp()
        test.local_rank = local_rank
        test.world_size = world_size
        test.test_bert_with_mrpc()
    else:
        unittest.main()
def main():
    """Train the MNIST NeuralNet with onnxruntime.

    Parses CLI args, builds the MNIST loaders, derives rank information from
    Open MPI environment variables, then trains for args.epochs epochs using
    either ORTTrainer (--use-ort-trainer) or ORTModel with a torch SGD
    optimizer.
    """
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    parser.add_argument('--use-ort', action='store_true', default=False,
                        help='to use onnxruntime as training backend')
    parser.add_argument('--use-ort-trainer', action='store_true', default=False,
                        help='to use onnxruntime as training backend')
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    kwargs = {'num_workers': 0, 'pin_memory': True}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    # Rank information comes from Open MPI's environment when launched via
    # mpirun; fall back to rank 0 for a plain single-process run.
    comm = MPI.COMM_WORLD
    args.local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) if ('OMPI_COMM_WORLD_LOCAL_RANK' in os.environ) else 0
    args.world_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) if ('OMPI_COMM_WORLD_RANK' in os.environ) else 0
    args.world_size = comm.Get_size()
    args.n_gpu = 1

    if use_cuda:
        # Fix: only touch CUDA when it is actually enabled and available —
        # the original called torch.cuda.set_device and set_cuda_device_id
        # unconditionally, which fails on CPU-only runs despite the CPU
        # device branch below.
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        set_cuda_device_id(args.local_rank)
    else:
        device = torch.device("cpu")

    input_size = 784
    hidden_size = 500
    num_classes = 10
    model = NeuralNet(input_size, hidden_size, num_classes)
    model_desc = mnist_model_description()

    if args.use_ort_trainer:
        # use log_interval as gradient accumulate steps
        trainer = ORTTrainer(model, my_loss, model_desc, "LambOptimizer", None,
                             IODescription('Learning_Rate', [1, ], torch.float32),
                             device, 1, None, args.world_rank, args.world_size,
                             use_mixed_precision=False,
                             allreduce_post_accumulation=True)
        print('\nBuild ort model done.')
        # Fix: removed stray debug leftover `import pdb` before the eval call.
        # NOTE(review): the mangled original is ambiguous about whether eval
        # runs once per epoch or once after training — per-epoch kept here.
        for epoch in range(1, args.epochs + 1):
            train_with_trainer(args, trainer, device, train_loader, epoch)
            test_with_trainer(args, trainer, device, test_loader)
    else:
        model = ORTModel(model, my_loss, model_desc, device, None, args.world_rank, args.world_size)
        print('\nBuild ort model done.')
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
        for epoch in range(1, args.epochs + 1):
            train_with_model(args, model, device, train_loader, optimizer, epoch)
def main():
    """Train the MNIST NeuralNet with an ORTTrainer (SGD optimizer).

    Parses CLI args, builds the MNIST loaders, derives rank information from
    Open MPI environment variables, then runs train/eval for args.epochs
    epochs on this rank's device.
    """
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size", type=int, default=64, metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size", type=int, default=1000, metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs", type=int, default=10, metavar="N",
                        help="number of epochs to train (default: 10)")
    parser.add_argument("--lr", type=float, default=0.01, metavar="LR",
                        help="learning rate (default: 0.01)")
    parser.add_argument("--no-cuda", action="store_true", default=False,
                        help="disables CUDA training")
    parser.add_argument("--seed", type=int, default=1, metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )
    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    kwargs = {"num_workers": 0, "pin_memory": True}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        ),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        ),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs,
    )

    # Rank information comes from Open MPI's environment when launched via
    # mpirun; fall back to rank 0 for a plain single-process run.
    comm = MPI.COMM_WORLD
    args.local_rank = (int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"])
                       if ("OMPI_COMM_WORLD_LOCAL_RANK" in os.environ) else 0)
    args.world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) if (
        "OMPI_COMM_WORLD_RANK" in os.environ) else 0
    args.world_size = comm.Get_size()

    if use_cuda:
        # Pin torch and ORT to the node-local GPU only when CUDA is enabled.
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        set_cuda_device_id(args.local_rank)
    else:
        device = torch.device("cpu")

    input_size = 784
    hidden_size = 500
    num_classes = 10
    model = NeuralNet(input_size, hidden_size, num_classes)
    model_desc = mnist_model_description()
    # use log_interval as gradient accumulate steps
    trainer = ORTTrainer(
        model,
        my_loss,
        model_desc,
        "SGDOptimizer",
        None,
        IODescription(
            "Learning_Rate",
            [
                1,
            ],
            torch.float32,
        ),
        device,
        1,
        args.world_rank,
        args.world_size,
        use_mixed_precision=False,
        allreduce_post_accumulation=True,
    )
    print("\nBuild ort model done.")
    # Fix: removed stray debug leftover `import pdb` before the eval call.
    # NOTE(review): the mangled original is ambiguous about whether eval runs
    # once per epoch or once after training — per-epoch kept here.
    for epoch in range(1, args.epochs + 1):
        train_with_trainer(args, trainer, device, train_loader, epoch)
        test_with_trainer(args, trainer, device, test_loader)