Code example #1
File: training_args.py  Project: whaozl/transformers
    def _setup_devices(self) -> "torch.device":
        logger.info("PyTorch: setting up devices")
        if self.no_cuda:
            device = torch.device("cpu")
            self._n_gpu = 0
        elif is_torch_tpu_available():
            device = xm.xla_device()
            self._n_gpu = 0
        elif is_sagemaker_mp_enabled():
            local_rank = smp.local_rank()
            device = torch.device("cuda", local_rank)
            self._n_gpu = 1
        elif is_sagemaker_dp_enabled():
            sm_dist.init_process_group()
            self.local_rank = sm_dist.get_local_rank()
            device = torch.device("cuda", self.local_rank)
            self._n_gpu = 1
        elif self.deepspeed:
            # deepspeed performs its own DDP internally, and requires the program to be started with:
            # deepspeed  ./program.py
            # rather than:
            # python -m torch.distributed.launch --nproc_per_node=2 ./program.py
            from .integrations import is_deepspeed_available

            if not is_deepspeed_available():
                raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.")
            import deepspeed

            deepspeed.init_distributed()

            # workaround for setups like notebooks where the launcher can't be used,
            # but deepspeed requires a dist env.
            # env LOCAL_RANK could be set manually by the user, or via init_distributed if mpi4py is installed
            self.local_rank = int(os.environ.get("LOCAL_RANK", "-1"))

            device = torch.device("cuda", self.local_rank)
            self._n_gpu = 1
        elif self.local_rank == -1:
            # if n_gpu is > 1 we'll use nn.DataParallel.
            # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
            # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
            # trigger an error that a device index is missing. Index 0 takes into account the
            # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
            # will use the first GPU in that env, i.e. GPU#1
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
            # the default value.
            self._n_gpu = torch.cuda.device_count()
        else:
            # Here, we'll use torch.distributed.
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
            torch.distributed.init_process_group(backend="nccl")
            device = torch.device("cuda", self.local_rank)
            self._n_gpu = 1

        if device.type == "cuda":
            torch.cuda.set_device(device)

        return device
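
For context, a rough usage sketch (not part of the excerpt above): in transformers this method backs the public TrainingArguments properties, so user code normally reads the resolved device indirectly rather than calling _setup_devices itself. The constructor arguments below are illustrative only.

from transformers import TrainingArguments

# Illustrative values; _setup_devices runs lazily behind these properties.
args = TrainingArguments(output_dir="./out", no_cuda=False)
print(args.device)  # e.g. cuda:0, an XLA device, or cpu
print(args.n_gpu)   # number of GPUs this process will use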
Code example #2
def dist_init(fn, args):
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True

        if cudnn.deterministic:
            warnings.warn('You have chosen to seed training. '
                          'This will turn on the CUDNN deterministic setting, '
                          'which can slow down your training considerably! '
                          'You may see unexpected behavior when restarting '
                          'from checkpoints.')

    args.is_distributed = len(args.hosts) > 1 and args.backend is not None
    args.is_multigpus = args.num_gpus > 1
    args.multigpus_distributed = (args.is_distributed or args.is_multigpus)

    logger.debug("multigpus_distributed - {}".format(
        args.multigpus_distributed))
    logger.debug("Number of gpus available - {}".format(args.num_gpus))

    #     print("######### Start Training #########")

    if args.multigpus_distributed:
        if args.apex:
            # Initialize the distributed environment.
            mp.spawn(fn, nprocs=args.num_gpus, args=(args, ))
        else:
            if args.data_parallel and not sdp.is_initialized():
                sdp.init_process_group()
            elif args.model_parallel and not smp.is_initialized():
                smp.init()

            fn(None, args)

            if args.model_parallel:
                smp.barrier()
    else:
        fn(0, args)
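
A hypothetical driver for dist_init, for illustration only: the flag names, defaults, and the train() signature below are assumptions, not taken from the original project, and the snippet presumes dist_init and its dependencies (mp, sdp, smp, logger) are importable.

import argparse

def train(local_rank, args):
    # local_rank is None on the smdistributed branches and an int otherwise
    print("training on rank {} with {} gpu(s)".format(local_rank, args.num_gpus))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=None)
    parser.add_argument("--num-gpus", type=int, default=1)
    parser.add_argument("--backend", type=str, default=None)
    parser.add_argument("--hosts", nargs="+", default=["algo-1"])
    parser.add_argument("--apex", action="store_true")
    parser.add_argument("--data-parallel", action="store_true")
    parser.add_argument("--model-parallel", action="store_true")
    args = parser.parse_args()
    dist_init(train, args)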
Code example #3
    def _setup_devices(self) -> "torch.device":
        logger.info("PyTorch: setting up devices")
        if self.no_cuda:
            device = torch.device("cpu")
            self._n_gpu = 0
        elif is_smdistributed_available() and self.mp_parameters != "":
            # smp.init()
            local_rank = smp.local_rank()
            device = torch.device("cuda", local_rank)
            self._n_gpu = 1
        elif is_sagemaker_distributed_available():
            import smdistributed.dataparallel.torch.distributed as dist

            dist.init_process_group()
            self.local_rank = dist.get_local_rank()
            device = torch.device("cuda", self.local_rank)
            self._n_gpu = 1
        elif self.local_rank == -1:
            # if n_gpu is > 1 we'll use nn.DataParallel.
            # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
            # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
            # trigger an error that a device index is missing. Index 0 takes into account the
            # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
            # will use the first GPU in that env, i.e. GPU#1
            device = torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu")
            # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
            # the default value.
            self._n_gpu = torch.cuda.device_count()
        else:
            # Here, we'll use torch.distributed.
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
            torch.distributed.init_process_group(backend="nccl")
            device = torch.device("cuda", self.local_rank)
            self._n_gpu = 1

        if device.type == "cuda":
            torch.cuda.set_device(device)

        return device
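
As a side note (an assumption about the surrounding setup, not shown in the excerpt): the local_rank != -1 branch expects a distributed launcher to have supplied the local rank, typically through the LOCAL_RANK environment variable or a --local_rank argument. A minimal sketch of picking it up:

import os

# Sketch: read the launcher-provided local rank, defaulting to -1 (not distributed).
local_rank = int(os.environ.get("LOCAL_RANK", -1))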
Code example #4
from six.moves import urllib
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)

import argparse
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP
import smdistributed.dataparallel.torch.distributed as dist
dist.init_process_group()

datasets.MNIST.resources = [
    ('https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz',
     'f68b3c2dcbeaaa9fbdd348bbdeb94873'),
    ('https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz',
     'd53e105ee54ea40749a09fcbcd1e9432'),
    ('https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz',
     '9fb629c4189551a2d022fa330f9573f3'),
    ('https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz',
     'ec29112dd5afa0611ce80d1b7f02629c')
]


class Net(nn.Module):
    def __init__(self):
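
The excerpt above is cut off at the start of the Net class. As an assumed continuation sketch (not from the original file), each process would typically pin its GPU using the smdistributed local rank before wrapping the model:

# Assumption: typical next steps after dist.init_process_group() above.
local_rank = dist.get_local_rank()
torch.cuda.set_device(local_rank)
# model = DDP(Net().cuda())  # Net's definition is truncated in the excerpt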
Code example #5
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import time
from argparse import ArgumentParser
import torch
import numpy as np
from torch.optim.lr_scheduler import MultiStepLR

import smdistributed.dataparallel.torch.distributed as herring
if not herring.is_initialized():
    herring.init_process_group()

import torch.utils.data.distributed
from src.model import SSD300, ResNet, Loss
from src.utils import dboxes300_coco, Encoder
from src.logger import Logger, BenchLogger
from src.evaluate import evaluate
from src.train import train_loop, tencent_trick, load_checkpoint, benchmark_train_loop, benchmark_inference_loop
from src.data import get_train_loader, get_val_dataset, get_val_dataloader, get_coco_ground_truth

import dllogger as DLLogger


# Apex imports
try:
    from apex.parallel.LARC import LARC
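
The excerpt stops inside the optional Apex import. As a small follow-on sketch (assumed, not from the original file), the herring alias exposes the usual process-group queries once init_process_group() has run:

# Assumed usage of the smdistributed process group imported as `herring`.
rank = herring.get_rank()
world_size = herring.get_world_size()
local_rank = herring.get_local_rank()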
Code example #6
import os
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from packaging.version import Version
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR

TORCH_VERSION = torch.__version__

if Version(TORCH_VERSION) < Version("1.10"):
    from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP
    import smdistributed.dataparallel.torch.distributed as dist
    dist.init_process_group()
else:
    from torch.nn.parallel import DistributedDataParallel as DDP
    import torch.distributed as dist
    import smdistributed.dataparallel.torch.torch_smddp

    # set default instance type to p3.16
    if "SAGEMAKER_INSTANCE_TYPE" not in os.environ:
        os.environ['SAGEMAKER_INSTANCE_TYPE'] = 'ml.p3.16xlarge'

    dist.init_process_group(backend='smddp')

# from torchvision 0.9.1, 2 candidate mirror website links will be added before "resources" items automatically
# Reference PR: https://github.com/pytorch/vision/pull/3559
TORCHVISION_VERSION = "0.9.1"
if Version(torchvision.__version__) < Version(TORCHVISION_VERSION):
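
The excerpt ends inside the torchvision version check. Since both branches above bind the same DDP and dist names, the rest of the script can stay backend-agnostic; the following is only an illustrative sketch (an assumption, not the original code) of sharding the dataset per rank:

from torch.utils.data.distributed import DistributedSampler

# Illustration: shard MNIST across ranks using whichever dist module was selected above.
train_set = datasets.MNIST("data", train=True, download=True,
                           transform=transforms.ToTensor())
sampler = DistributedSampler(train_set,
                             num_replicas=dist.get_world_size(),
                             rank=dist.get_rank())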
Code example #7
File: pytorch_mnist_smdp.py  Project: Napkin-DL/ml-ws
import logging
import sys
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
from torchvision import datasets, transforms

########################################################
####### 1. SageMaker Distributed Data Parallel  ########
#######  - Import Package and Initialization    ########
########################################################

import smdistributed.dataparallel.torch.distributed as smdp
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as smdpDDP

if not smdp.is_initialized():
    smdp.init_process_group()

#######################################################

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


# Based on https://github.com/pytorch/examples/blob/master/mnist/main.py
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()