import torch
import torch.distributed as dist


def all_reduce_item(value, op='sum'):
    """
    All-reduces a single scalar value if distributed is in use.
    """
    if dist.is_available() and dist.is_initialized():
        if op == 'sum' or op == 'mean':
            dop = dist.ReduceOp.SUM
        elif op == 'min':
            dop = dist.ReduceOp.MIN
        elif op == 'max':
            dop = dist.ReduceOp.MAX
        elif op == 'product':
            dop = dist.ReduceOp.PRODUCT
        else:
            raise RuntimeError('Unsupported reduce op')

        # backend = dist.get_backend()
        # if backend == dist.Backend.NCCL:
        #     device = torch.device('cuda')
        # elif backend == dist.Backend.GLOO:
        #     device = torch.device('cpu')
        # else:
        #     raise RuntimeError('Unsupported distributed backend')

        # the NCCL backend is assumed here, so the value is reduced on a
        # CUDA tensor
        device = torch.device('cuda')
        tensor = torch.tensor(value, device=device)
        dist.all_reduce(tensor, dop)
        if op == 'mean':
            tensor /= get_world_size()
        ret = tensor.item()
    else:
        ret = value
    return ret
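
# Usage sketch (a hypothetical helper, not part of the original code):
# average a per-rank scalar so every worker logs the same number.
# Assumes the process group was initialized with the NCCL backend,
# since all_reduce_item above reduces on a CUDA tensor.
def log_mean_loss(local_loss):
    mean_loss = all_reduce_item(local_loss, op='mean')
    if get_rank() == 0:
        print('mean loss across {} workers: {:.4f}'.format(
            get_world_size(), mean_loss))
    return mean_loss
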
def get_rank():
    """
    Gets distributed rank or returns zero if distributed is not initialized.
    """
    if dist.is_available() and dist.is_initialized():
        rank = dist.get_rank()
    else:
        rank = 0
    return rank


def get_world_size():
    """
    Gets total number of distributed workers or returns one if distributed is
    not initialized.
    """
    if dist.is_available() and dist.is_initialized():
        world_size = dist.get_world_size()
    else:
        world_size = 1
    return world_size
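
# Usage sketch (a hypothetical helper, not from the original code): gate
# side effects such as checkpoint writes to rank 0 so that workers do
# not clobber each other's files.
def save_on_master(state, path):
    if get_rank() == 0:
        torch.save(state, path)
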
import logging
import random
import warnings

import numpy as np
import torch
import torch.backends.cudnn as cudnn
import torch.multiprocessing as mp

# SageMaker data- and model-parallel entry points used in dist_init below
import smdistributed.dataparallel.torch.distributed as sdp
import smdistributed.modelparallel.torch as smp

logger = logging.getLogger(__name__)


def dist_init(fn, args):
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    args.is_distributed = len(args.hosts) > 1 and args.backend is not None
    args.is_multigpus = args.num_gpus > 1
    args.multigpus_distributed = (args.is_distributed or args.is_multigpus)

    logger.debug("multigpus_distributed - {}".format(
        args.multigpus_distributed))
    logger.debug("Number of gpus available - {}".format(args.num_gpus))

    #     print("######### Start Training #########")

    if args.multigpus_distributed:
        if args.apex:
            # Initialize the distributed environment.
            mp.spawn(fn, nprocs=args.num_gpus, args=(args, ))
        else:
            if args.data_parallel and not sdp.is_initialized():
                sdp.init_process_group()
            elif args.model_parallel and not smp.is_initialized():
                smp.init()

            fn(None, args)

            if args.model_parallel:
                smp.barrier()
    else:
        fn(0, args)
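

# Minimal driver sketch for dist_init (hypothetical values; the real
# `args` comes from argparse in the surrounding training script). The
# attribute names mirror the ones dist_init reads above.
def example_dist_init():
    from types import SimpleNamespace

    def worker(local_rank, args):
        # a real worker would build the model, wrap it for data or model
        # parallelism, and run the training loop
        logger.debug('worker started, local rank {}'.format(local_rank))

    args = SimpleNamespace(seed=42,
                           hosts=['algo-1'],
                           backend=None,
                           num_gpus=torch.cuda.device_count(),
                           apex=False,
                           data_parallel=False,
                           model_parallel=False)
    dist_init(worker, args)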

import os
import time
from argparse import ArgumentParser
import torch
import numpy as np
from torch.optim.lr_scheduler import MultiStepLR

# "herring" was the internal project name for SMDataParallel; some AWS
# examples keep it as the import alias
import smdistributed.dataparallel.torch.distributed as herring
if not herring.is_initialized():
    herring.init_process_group()
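
# A typical next step with SMDataParallel (sketched from standard usage,
# not necessarily verbatim from this script): pin each worker process to
# the GPU matching its local rank before building the model.
torch.cuda.set_device(herring.get_local_rank())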

import torch.utils.data.distributed
from src.model import SSD300, ResNet, Loss
from src.utils import dboxes300_coco, Encoder
from src.logger import Logger, BenchLogger
from src.evaluate import evaluate
from src.train import train_loop, tencent_trick, load_checkpoint, benchmark_train_loop, benchmark_inference_loop
from src.data import get_train_loader, get_val_dataset, get_val_dataloader, get_coco_ground_truth

import dllogger as DLLogger


# Apex imports
try:
    # assumed completion of this truncated import guard, mirroring the
    # apex.amp pattern used later in this file
    from apex import amp
    use_amp = True
except ImportError:
    use_amp = False

from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.utils.collect_env import collect_env_info
from maskrcnn_benchmark.utils.comm import synchronize, get_rank, is_main_process
from maskrcnn_benchmark.utils.imports import import_file
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.miscellaneous import mkdir
from maskrcnn_benchmark.engine.tester import test

# Import SMDataParallel modules for PyTorch.
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as DDP
import smdistributed.dataparallel.torch.distributed as dist
# SMDataParallel: initialize the process group exactly once; the guard
# keeps init_process_group from raising if it was already called
if not dist.is_initialized():
    dist.init_process_group()
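
# Sketch of typical SMDataParallel model wrapping (assumed usage, not a
# quote from maskrcnn_benchmark): build the detector on this worker's
# GPU and hand it to the SMDataParallel DistributedDataParallel wrapper.
def build_distributed_model(cfg):
    model = build_detection_model(cfg)
    model.to(torch.device('cuda', dist.get_local_rank()))
    return DDP(model)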


# See if we can use apex.DistributedDataParallel instead of the torch default,
# and enable mixed-precision via apex.amp
try:
    from apex import amp
    use_amp = True
except ImportError:
    print('APEX is not installed; mixed precision via apex.amp is disabled')
    use_amp = False
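
# Sketch of how use_amp is typically consumed (assumed apex.amp usage,
# not verbatim from this script): opt into O1 mixed precision only when
# the import above succeeded.
def maybe_initialize_amp(model, optimizer):
    if use_amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    return model, optimizer
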
# try:
#     from apex.parallel import DistributedDataParallel as DDP
#     use_apex_ddp = True
# except ImportError:
#     use_apex_ddp = False


def barrier():
    """
    Call dist.barrier() if distributed is in use
    """
    if dist.is_available() and dist.is_initialized():
        dist.barrier()
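
# Usage sketch (a hypothetical helper): let rank 0 perform a one-time
# side effect, e.g. downloading a dataset, while the other workers wait
# at the barrier.
def run_on_master_then_sync(fn):
    if get_rank() == 0:
        fn()
    barrier()
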
import logging
import sys

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
from torchvision import datasets, transforms

########################################################
####### 1. SageMaker Distributed Data Parallel  ########
#######  - Import Package and Initialization    ########
########################################################

import smdistributed.dataparallel.torch.distributed as smdp
from smdistributed.dataparallel.torch.parallel.distributed import DistributedDataParallel as smdpDDP

if not smdp.is_initialized():
    smdp.init_process_group()

#######################################################
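
# Typical follow-up after initializing smdp (a sketch assuming the MNIST
# script structure referenced below, not verbatim from the original):
# shard the dataset so each data-parallel worker trains on a distinct
# slice.
def make_train_loader(dataset, batch_size):
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset,
        num_replicas=smdp.get_world_size(),
        rank=smdp.get_rank())
    return torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=sampler)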

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))


# Based on https://github.com/pytorch/examples/blob/master/mnist/main.py
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)