def _setup_devices(self) -> "torch.device":
    """Pick the torch device for this process and record the GPU count.

    The branches below are tried in order and the first match wins:

    * ``self.no_cuda``: CPU, ``_n_gpu = 0``.
    * TPU available: the XLA device from ``xm.xla_device()``, ``_n_gpu = 0``.
    * SageMaker model parallel enabled: ``cuda:<smp.local_rank()>``, ``_n_gpu = 1``.
    * SageMaker data parallel enabled: initialises ``sm_dist`` and stores its
      local rank in ``self.local_rank``, ``_n_gpu = 1``.
    * ``self.deepspeed``: initialises DeepSpeed's distributed environment and
      reads ``self.local_rank`` from the ``LOCAL_RANK`` environment variable,
      ``_n_gpu = 1``.
    * ``self.local_rank == -1``: ``cuda:0`` when CUDA is available, else CPU;
      ``_n_gpu`` is ``torch.cuda.device_count()``.
    * otherwise: ``torch.distributed`` with the NCCL backend on
      ``cuda:<self.local_rank>``, ``_n_gpu = 1``.

    When the selected device is a CUDA device it is also made the current
    CUDA device.

    Returns:
        The selected ``torch.device``.

    Raises:
        ImportError: if ``self.deepspeed`` is set but DeepSpeed is not installed.
    """
    logger.info("PyTorch: setting up devices")
    if self.no_cuda:
        device = torch.device("cpu")
        self._n_gpu = 0
    elif is_torch_tpu_available():
        device = xm.xla_device()
        self._n_gpu = 0
    elif is_sagemaker_mp_enabled():
        local_rank = smp.local_rank()
        device = torch.device("cuda", local_rank)
        self._n_gpu = 1
    elif is_sagemaker_dp_enabled():
        # NOTE(review): left unguarded because the definition of `sm_dist` is not
        # visible here; if it exposes `is_initialized()`, guard this call too.
        sm_dist.init_process_group()
        self.local_rank = sm_dist.get_local_rank()
        device = torch.device("cuda", self.local_rank)
        self._n_gpu = 1
    elif self.deepspeed:
        # deepspeed performs its own DDP internally, and requires the program to be started with:
        # deepspeed  ./program.py
        # rather than:
        # python -m torch.distributed.launch --nproc_per_node=2 ./program.py
        from .integrations import is_deepspeed_available

        if not is_deepspeed_available():
            raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.")
        import deepspeed

        deepspeed.init_distributed()

        # workaround for setups like notebooks where the launcher can't be used,
        # but deepspeed requires a dist env.
        # env LOCAL_RANK could be set manually by the user, or via init_distributed if mpi4py is installed
        self.local_rank = int(os.environ.get("LOCAL_RANK", "-1"))

        device = torch.device("cuda", self.local_rank)
        self._n_gpu = 1
    elif self.local_rank == -1:
        # if n_gpu is > 1 we'll use nn.DataParallel.
        # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
        # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
        # trigger an error that a device index is missing. Index 0 takes into account the
        # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
        # will use the first GPU in that env, i.e. GPU#1
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Always recompute the GPU count here instead of relying on a value
        # that may or may not have been set earlier.
        self._n_gpu = torch.cuda.device_count()
    else:
        # Here, we'll use torch.distributed.
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs.
        # The default process group can only be created once, so skip the call
        # when the caller (or a launcher wrapper) has already initialised it.
        if not torch.distributed.is_initialized():
            torch.distributed.init_process_group(backend="nccl")
        device = torch.device("cuda", self.local_rank)
        self._n_gpu = 1

    if device.type == "cuda":
        torch.cuda.set_device(device)

    return device
def dist_init(fn, args):
    """Seed the RNGs if requested, derive the distribution flags, and run ``fn``.

    Args:
        fn: Training entry point, invoked as ``fn(rank, args)``. It is called
            with rank ``0`` in the non-distributed case, with ``None`` when
            SageMaker data/model parallelism drives the run, and is handed to
            ``mp.spawn`` when ``args.apex`` is set.
        args: Namespace-like object. Reads ``seed``, ``hosts``, ``backend``,
            ``num_gpus``, ``apex``, ``data_parallel`` and ``model_parallel``;
            writes ``is_distributed``, ``is_multigpus`` and
            ``multigpus_distributed``.
    """
    seed = args.seed
    if seed is not None:
        # Seed every RNG in use so runs are repeatable.
        random.seed(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)
        torch.cuda.manual_seed_all(seed)
        cudnn.deterministic = True

        if cudnn.deterministic:
            seeded_notice = (
                "You have chosen to seed training. "
                "This will turn on the CUDNN deterministic setting, "
                "which can slow down your training considerably! "
                "You may see unexpected behavior when restarting "
                "from checkpoints."
            )
            warnings.warn(seeded_notice)

    # Derive the topology flags that the rest of the run reads from `args`.
    args.is_distributed = len(args.hosts) > 1 and args.backend is not None
    args.is_multigpus = args.num_gpus > 1
    args.multigpus_distributed = args.is_distributed or args.is_multigpus

    logger.debug(f"multigpus_distributed - {args.multigpus_distributed}")
    logger.debug(f"Number of gpus available - {args.num_gpus}")

    # Single process, single GPU: run directly as rank 0.
    if not args.multigpus_distributed:
        fn(0, args)
        return

    # Apex path: spawn one worker process per GPU.
    if args.apex:
        mp.spawn(fn, nprocs=args.num_gpus, args=(args, ))
        return

    # SageMaker path: initialise whichever library was requested, but only once.
    if args.data_parallel and not sdp.is_initialized():
        sdp.init_process_group()
    elif args.model_parallel and not smp.is_initialized():
        smp.init()

    fn(None, args)

    if args.model_parallel:
        smp.barrier()
def _setup_devices(self) -> "torch.device":
    """Pick the torch device for this process and record the GPU count.

    The branches below are tried in order and the first match wins:

    * ``self.no_cuda``: CPU, ``_n_gpu = 0``.
    * smdistributed available and ``self.mp_parameters`` non-empty:
      ``cuda:<smp.local_rank()>``, ``_n_gpu = 1``.
    * SageMaker distributed (data parallel) available: initialises the
      smdistributed process group if needed and stores its local rank in
      ``self.local_rank``, ``_n_gpu = 1``.
    * ``self.local_rank == -1``: ``cuda:0`` when CUDA is available, else CPU;
      ``_n_gpu`` is ``torch.cuda.device_count()``.
    * otherwise: ``torch.distributed`` with the NCCL backend on
      ``cuda:<self.local_rank>``, ``_n_gpu = 1``.

    When the selected device is a CUDA device it is also made the current
    CUDA device.

    Returns:
        The selected ``torch.device``.
    """
    logger.info("PyTorch: setting up devices")
    if self.no_cuda:
        device = torch.device("cpu")
        self._n_gpu = 0
    elif is_smdistributed_available() and self.mp_parameters != "":
        # NOTE(review): the original carried a commented-out `smp.init()` here;
        # presumably smp is initialised elsewhere before this runs -- confirm.
        local_rank = smp.local_rank()
        device = torch.device("cuda", local_rank)
        self._n_gpu = 1
    elif is_sagemaker_distributed_available():
        import smdistributed.dataparallel.torch.distributed as dist

        # Only create the process group if nothing else has done so already.
        if not dist.is_initialized():
            dist.init_process_group()
        self.local_rank = dist.get_local_rank()
        device = torch.device("cuda", self.local_rank)
        self._n_gpu = 1
    elif self.local_rank == -1:
        # if n_gpu is > 1 we'll use nn.DataParallel.
        # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
        # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
        # trigger an error that a device index is missing. Index 0 takes into account the
        # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
        # will use the first GPU in that env, i.e. GPU#1
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Always recompute the GPU count here instead of relying on a value
        # that may or may not have been set earlier.
        self._n_gpu = torch.cuda.device_count()
    else:
        # Here, we'll use torch.distributed.
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs.
        # The default process group can only be created once, so skip the call
        # when it has already been initialised.
        if not torch.distributed.is_initialized():
            torch.distributed.init_process_group(backend="nccl")
        device = torch.device("cuda", self.local_rank)
        self._n_gpu = 1

    if device.type == "cuda":
        torch.cuda.set_device(device)

    return device
# Install a process-wide urllib opener that sends a browser-like User-agent
# header with requests made through urllib.request.
from six.moves import urllib
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

import argparse
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP
import smdistributed.dataparallel.torch.distributed as dist

# Initialise the smdistributed data-parallel process group at import time.
dist.init_process_group()

# Replace torchvision's MNIST resource list with entries served from the
# ossci-datasets S3 bucket. Each entry is (URL, hex digest) -- presumably the
# MD5 checksum torchvision verifies after download; confirm.
datasets.MNIST.resources = [
    ('https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz', 'f68b3c2dcbeaaa9fbdd348bbdeb94873'),
    ('https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz', 'd53e105ee54ea40749a09fcbcd1e9432'),
    ('https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz', '9fb629c4189551a2d022fa330f9573f3'),
    ('https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz', 'ec29112dd5afa0611ce80d1b7f02629c')
]

class Net(nn.Module):
    def __init__(self):
        # NOTE(review): the body of this method lies beyond the visible excerpt.
# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import time from argparse import ArgumentParser import torch import numpy as np from torch.optim.lr_scheduler import MultiStepLR import smdistributed.dataparallel.torch.distributed as herring if not herring.is_initialized(): herring.init_process_group() import torch.utils.data.distributed from src.model import SSD300, ResNet, Loss from src.utils import dboxes300_coco, Encoder from src.logger import Logger, BenchLogger from src.evaluate import evaluate from src.train import train_loop, tencent_trick, load_checkpoint, benchmark_train_loop, benchmark_inference_loop from src.data import get_train_loader, get_val_dataset, get_val_dataloader, get_coco_ground_truth import dllogger as DLLogger # Apex imports try: from apex.parallel.LARC import LARC
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from packaging.version import Version
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

# Choose the DDP wrapper and the `dist` module based on the installed PyTorch
# version, then initialise the process group at import time.
# NOTE(review): the nesting below is reconstructed from whitespace-collapsed
# source; confirm against the original file.
TORCH_VERSION = torch.__version__
if Version(TORCH_VERSION) < Version("1.10"):
    # PyTorch < 1.10: use the smdistributed DDP wrapper and its own dist module.
    from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP
    import smdistributed.dataparallel.torch.distributed as dist
    dist.init_process_group()
else:
    # PyTorch >= 1.10: use native DDP/torch.distributed with the 'smddp'
    # backend. torch_smddp is imported before the backend is requested --
    # presumably that import is what registers it; confirm.
    from torch.nn.parallel import DistributedDataParallel as DDP
    import torch.distributed as dist
    import smdistributed.dataparallel.torch.torch_smddp
    # set default instance type to p3.16
    # NOTE(review): `os` is not imported in the visible excerpt; it must be
    # imported earlier in the file.
    if "SAGEMAKER_INSTANCE_TYPE" not in os.environ:
        os.environ['SAGEMAKER_INSTANCE_TYPE'] = 'ml.p3.16xlarge'
    dist.init_process_group(backend='smddp')

# from torchvision 0.9.1, 2 candidate mirror website links will be added before "resources" items automatically
# Reference PR: https://github.com/pytorch/vision/pull/3559
# NOTE(review): the body of the version check below lies beyond the visible excerpt.
TORCHVISION_VERSION = "0.9.1"
if Version(torchvision.__version__) < Version(TORCHVISION_VERSION):
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
from torchvision import datasets, transforms

########################################################
####### 1. SageMaker Distributed Data Parallel  ########
#######  - Import Package and Initialization    ########
########################################################

import smdistributed.dataparallel.torch.distributed as smdp
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as smdpDDP

# Initialise the smdistributed data-parallel process group once, at import time.
if not smdp.is_initialized():
    smdp.init_process_group()

#######################################################

# Module logger: DEBUG level, with records also written to stdout.
# NOTE(review): `logging` and `sys` are not imported in the visible excerpt;
# they must be imported earlier in the file.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


# Based on https://github.com/pytorch/examples/blob/master/mnist/main.py
class Net(nn.Module):
    # NOTE(review): only part of this class may be visible in the excerpt;
    # further layers and the forward pass may follow.
    def __init__(self):
        super(Net, self).__init__()
        # Two 5x5 convolutions: 1 -> 10 channels, then 10 -> 20 channels.
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        # 2D dropout layer with default arguments.
        self.conv2_drop = nn.Dropout2d()