Example #1
def distributed_init(config):
    if config.distributed.world_size == 1:
        raise ValueError(
            "Cannot initialize distributed with distributed_world_size=1")
    logger.info(f"XLA Mode:{is_xla()}")

    if is_xla():
        config.device_id = xm.get_local_ordinal()
        config.distributed.rank = xm.get_ordinal()
    elif dist.is_initialized():
        warnings.warn(
            "Distributed is already initialized, cannot initialize twice!")
        config.distributed.rank = dist.get_rank()
    else:
        logger.info(f"Distributed Init (Rank {config.distributed.rank}): "
                    f"{config.distributed.init_method}")
        dist.init_process_group(
            backend=config.distributed.backend,
            init_method=config.distributed.init_method,
            world_size=config.distributed.world_size,
            rank=config.distributed.rank,
        )
        logger.info(f"Initialized Host {socket.gethostname()} as Rank "
                    f"{config.distributed.rank}")

        # perform a dummy all-reduce to initialize the NCCL communicator
        dist.all_reduce(torch.zeros(1).cuda())

        suppress_output(is_master())
        config.distributed.rank = dist.get_rank()
    return config.distributed.rank
Example #2
def distributed_init(config):
    if config.distributed.world_size == 1:
        raise ValueError(
            "Cannot initialize distributed with distributed_world_size=1")
    logger.info(f"XLA Mode:{is_xla()}")

    if is_xla():
        config.device_id = xm.get_local_ordinal()
        config.distributed.rank = xm.get_ordinal()
    elif dist.is_initialized():
        warnings.warn(
            "Distributed is already initialized, cannot initialize twice!")
        config.distributed.rank = dist.get_rank()
    else:
        logger.info(f"Distributed Init (Rank {config.distributed.rank}): "
                    f"{config.distributed.init_method}")

        nccl_config = config.distributed.get("nccl", {})

        if nccl_config.get("nsocks_perthread", None):
            os.environ["NCCL_NSOCKS_PERTHREAD"] = str(
                nccl_config["nsocks_perthread"])
            logger.info(
                f"NCCL_NSOCKS_PERTHREAD: {os.environ['NCCL_NSOCKS_PERTHREAD']}"
            )

        if nccl_config.get("socket_nthreads", None):
            os.environ["NCCL_SOCKET_NTHREADS"] = str(
                nccl_config["socket_nthreads"])
            logger.info(
                f"NCCL_SOCKET_NTHREADS: {os.environ['NCCL_SOCKET_NTHREADS']}")

        dist.init_process_group(
            backend=config.distributed.backend,
            init_method=config.distributed.init_method,
            world_size=config.distributed.world_size,
            rank=config.distributed.rank,
        )
        logger.info(f"Initialized Host {socket.gethostname()} as Rank "
                    f"{config.distributed.rank}")

        if "MASTER_ADDR" not in os.environ or "MASTER_PORT" not in os.environ:
            # Set for onboxdataloader support
            split = config.distributed.init_method.split("//")
            assert len(split) == 2, (
                "host url for distributed should be split by '//' " +
                "into exactly two elements")

            split = split[1].split(":")
            assert (len(split) == 2
                    ), "host url should be of the form <host_url>:<host_port>"
            os.environ["MASTER_ADDR"] = split[0]
            os.environ["MASTER_PORT"] = split[1]

        # perform a dummy all-reduce to initialize the NCCL communicator
        dist.all_reduce(torch.zeros(1).cuda())

        suppress_output(is_main())
        config.distributed.rank = dist.get_rank()
    return config.distributed.rank
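Example #3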
    def new_process(self, process_idx: int, trainer, mp_queue) -> None:
        self.mp_queue = mp_queue

        reset_seed()

        self.tpu_local_core_rank = xm.get_local_ordinal()
        self.tpu_global_core_rank = xm.get_ordinal()

        # set warning rank
        rank_zero_only.rank = self.global_rank

        if self.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None:
            trainer.progress_bar_callback.disable()

        self.model_to_device()
        trainer.accelerator.setup_optimizers(trainer)
        trainer.precision_plugin.connect(self._model, None, None)

        self.barrier("pre-run-stage")

        results = trainer.run_stage()

        self.transfer_distrib_spawn_state_on_fit_end(results)

        # https://github.com/pytorch/xla/issues/1801#issuecomment-602799542
        self.barrier("end-process")

        # https://github.com/pytorch/xla/issues/2190#issuecomment-641665358
        if self.local_rank == 0:
            time.sleep(2)
Example #4
    def configure_device(self) -> None:
        if self.config.training.get("device", "cuda") == "xla":
            import torch_xla.core.xla_model as xm

            self.device = xm.xla_device()
            self.distributed = True
            self.local_rank = xm.get_local_ordinal()
            is_xla = True
        else:
            is_xla = False
            self.local_rank = self.config.device_id
            self.device = self.local_rank
            self.distributed = False

        # Will be updated later based on distributed setup
        registry.register("global_device", self.device)

        if self.config.distributed.init_method is not None:
            self.distributed = True
            self.device = torch.device("cuda", self.local_rank)
            torch.cuda.set_device(self.local_rank)
        elif torch.cuda.is_available():
            self.device = torch.device("cuda")
            torch.cuda.set_device(0)
        elif not is_xla:
            self.device = torch.device("cpu")

        registry.register("global_device", self.config.distributed.rank)
Example #5
    def _thread_fn(local_ordinal, global_ordinal):
      pjrt.set_local_ordinal(local_ordinal)
      pjrt.set_global_ordinal(global_ordinal)

      time.sleep(1)

      return xm.get_local_ordinal(), xm.get_ordinal()
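Example #6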
    def __setup_tpu_training(self, model: LightningModule, trainer):
        # use the default device from the process
        # tpu_device = xm.xla_device()

        # if given an ordinal device, use this as the device
        if trainer.tpu_id is not None:
            tpu_device = xm.xla_device(trainer.tpu_id)
        else:
            tpu_device = xm.xla_device()
        # track the device and move model to it
        trainer._device = tpu_device
        model.to(trainer._device)

        # get the appropriate tpu ranks
        trainer.tpu_local_core_rank = xm.get_local_ordinal()
        trainer.tpu_global_core_rank = xm.get_ordinal()

        # avoid duplicating progress bar
        if trainer.tpu_global_core_rank != 0 and trainer.progress_bar_callback is not None:
            trainer.progress_bar_callback.disable()

        trainer.global_rank = trainer.tpu_local_core_rank
        rank_zero_only.rank = trainer.global_rank

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.setup_optimizers(model)

        # init 16 bit for TPU
        if trainer.precision == 16:
            os.environ['XLA_USE_BF16'] = str(1)

        log.info(f'INIT TPU local core: {trainer.tpu_local_core_rank},'
                 f' global rank: {trainer.tpu_global_core_rank}'
                 f' with XLA_USE_BF16={os.environ.get("XLA_USE_BF16")}')
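Example #7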
    def tpu_train(self, tpu_core_idx, model):
        # put model on tpu
        model.to(xm.xla_device())

        # get the appropriate tpu ranks
        self.tpu_local_core_rank = xm.get_local_ordinal()
        self.tpu_global_core_rank = xm.get_ordinal()

        # avoid duplicating progress bar
        self.show_progress_bar = self.show_progress_bar and self.tpu_global_core_rank == 0

        # track current tpu
        self.current_tpu_idx = tpu_core_idx
        self.proc_rank = self.tpu_local_core_rank

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(
            model)

        # init 16 bit for TPU
        if self.precision == 16:
            os.environ['XLA_USE_BF16'] = str(1)

        log.info(f'INIT TPU local core: {self.tpu_local_core_rank},'
                 f' global rank: {self.tpu_global_core_rank}')

        # continue training routine
        self.run_pretrain_routine(model)

        self.save_spawn_weights(model)
Example #8
    def tpu_train(self, model):
        # put model on tpu
        self._device = xm.xla_device(
            self.tpu_id) if self.tpu_id is not None else xm.xla_device()
        model.to(self._device)

        # get the appropriate tpu ranks
        self.tpu_local_core_rank = xm.get_local_ordinal()
        self.tpu_global_core_rank = xm.get_ordinal()

        # avoid duplicating progress bar
        if self.tpu_global_core_rank != 0 and self.progress_bar_callback is not None:
            self.progress_bar_callback.disable()

        self.proc_rank = self.tpu_local_core_rank
        rank_zero_only.rank = self.proc_rank

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(
            model)

        # init 16 bit for TPU
        if self.precision == 16:
            os.environ['XLA_USE_BF16'] = str(1)

        log.info(f'INIT TPU local core: {self.tpu_local_core_rank},'
                 f' global rank: {self.tpu_global_core_rank}')

        # continue training routine
        self.run_pretrain_routine(model)

        # when training ends on these platforms dump weights to get out of the main process
        if self.on_colab_kaggle:
            self.save_spawn_weights(model)
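Example #9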
def distributed_init(args):
    if args.distributed_world_size == 1:
        raise ValueError(
            'Cannot initialize distributed with distributed_world_size=1')

    if not getattr(args, 'tpu', False):
        if torch.distributed.is_initialized():
            warnings.warn(
                'Distributed is already initialized, cannot initialize twice!')
        else:
            logger.info('distributed init (rank {}): {}'.format(
                args.distributed_rank,
                args.distributed_init_method,
            ))
            dist.init_process_group(
                backend=args.distributed_backend,
                init_method=args.distributed_init_method,
                world_size=args.distributed_world_size,
                rank=args.distributed_rank,
            )
            logger.info('initialized host {} as rank {}'.format(
                socket.gethostname(),
                args.distributed_rank,
            ))

            # perform a dummy all-reduce to initialize the NCCL communicator
            if torch.cuda.is_available():
                dist.all_reduce(torch.zeros(1).cuda())

        args.distributed_rank = torch.distributed.get_rank()
    else:
        import torch_xla.core.xla_model as xm
        assert xm.xrt_world_size() == args.distributed_world_size
        args.device_id = xm.get_local_ordinal()
        args.distributed_rank = xm.get_ordinal()
        xm.rendezvous('distributed_init')  # wait for all workers
        xm.mark_step()

    if is_master(args):
        logging.getLogger().setLevel(logging.INFO)
    else:
        logging.getLogger().setLevel(logging.WARNING)

    if args.model_parallel_size > 1:
        try:
            from fairseq.model_parallel.megatron.mpu import (
                get_model_parallel_rank,
                initialize_model_parallel,
                model_parallel_cuda_manual_seed,
            )
        except ImportError:
            raise ImportError('\n\nPlease install the megatron submodule:'
                              '\n\n  git submodule update --init '
                              'fairseq/model_parallel/megatron')
        initialize_model_parallel(args.model_parallel_size)
        model_parallel_cuda_manual_seed(args.seed)
        model_part_number = get_model_parallel_rank()
        args.checkpoint_suffix += '-model_part-{0}'.format(model_part_number)
    return args.distributed_rank
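Example #10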
def _extract_metrics_file():
  # Delay xla_model import to avoid cross dependencies.
  import torch_xla.core.xla_model as xm
  metrics_file = os.environ.get('XLA_METRICS_FILE', None)
  if metrics_file is not None:
    ordinal = xm.get_local_ordinal(defval=-1)
    if ordinal >= 0 and xm.xrt_world_size() > 1:
      metrics_file = '{}.{}'.format(metrics_file, ordinal)
  return metrics_file
Example #11
    def pre_dispatch(self) -> None:
        if isinstance(self.device, int):
            self.device = xm.xla_device(self.device)

        if self.debug:
            os.environ["PT_XLA_DEBUG"] = str(1)

        self.tpu_local_core_rank = xm.get_local_ordinal()
        self.tpu_global_core_rank = xm.get_ordinal()
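Example #12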
 def local_process_index(self):
     """
     The index of the local process used.
     """
     if is_torch_tpu_available():
         return xm.get_local_ordinal()
     elif is_sagemaker_mp_enabled():
         return smp.local_rank()
     elif is_sagemaker_dp_enabled():
         return sm_dist.get_rank()
     elif self.local_rank != -1:
         return self.local_rank
     return 0
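Example #13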
    def setup(self, trainer: "pl.Trainer") -> None:
        shared_params = find_shared_parameters(self.model)
        self.model_to_device()
        if is_overridden("on_post_move_to_device", self.lightning_module):
            self.model.on_post_move_to_device()
        else:
            set_shared_parameters(self.model, shared_params)

        super().setup(trainer)

        if self.debug:
            os.environ["PT_XLA_DEBUG"] = str(1)

        self.tpu_local_core_rank = xm.get_local_ordinal()
        self.tpu_global_core_rank = xm.get_ordinal()
Example #14
    def configure_device(self) -> None:
        if self.config.training.get("device", "cuda") == "xla":
            import torch_xla.core.xla_model as xm

            self.device = xm.xla_device()
            self.distributed = True
            self.local_rank = xm.get_local_ordinal()
            is_xla = True
        else:
            is_xla = False
            if "device_id" not in self.config:
                warnings.warn(
                    "No 'device_id' in 'config', setting to -1. "
                    "This can cause issues later in training. Ensure that "
                    "distributed setup is properly initialized.")
                self.local_rank = -1
            else:
                self.local_rank = self.config.device_id
            self.device = self.local_rank
            self.distributed = False

        # Will be updated later based on distributed setup
        registry.register("global_device", self.device)

        if self.config.distributed.init_method is not None:
            self.distributed = True
            self.device = torch.device("cuda", self.local_rank)
            torch.cuda.set_device(self.local_rank)
        elif torch.cuda.is_available():
            self.device = torch.device("cuda")
            torch.cuda.set_device(0)
        elif not is_xla:
            self.device = torch.device("cpu")

        if "rank" not in self.config.distributed:
            if torch.distributed.is_available(
            ) and torch.distributed.is_initialized():
                global_rank = torch.distributed.get_rank()
            else:
                global_rank = -1
            with open_dict(self.config.distributed):
                self.config.distributed.rank = global_rank

        registry.register("global_device", self.config.distributed.rank)
Example #15
def build_progress_bar(args, iterator, epoch=None, prefix=None, default='tqdm', no_progress_bar='none'):
    if args.log_format is None:
        args.log_format = no_progress_bar if args.no_progress_bar else default

    if args.log_format == 'tqdm' and not sys.stderr.isatty():
        args.log_format = 'simple'

    if args.log_format == 'json':
        bar = json_progress_bar(iterator, epoch, prefix, args.log_interval)
    elif args.log_format == 'none':
        bar = noop_progress_bar(iterator, epoch, prefix)
    elif args.log_format == 'simple':
        bar = simple_progress_bar(iterator, epoch, prefix, args.log_interval)
    elif args.log_format == 'tqdm':
        bar = tqdm_progress_bar(iterator, epoch, prefix)
    else:
        raise ValueError('Unknown log format: {}'.format(args.log_format))

    if args.tbmf_wrapper and distributed_utils.is_master(args):
        global g_tbmf_wrapper
        if g_tbmf_wrapper is None:
            try:
                from fairseq.fb_tbmf_wrapper import fb_tbmf_wrapper
            except Exception:
                raise ImportError("fb_tbmf_wrapper package not found.")
            g_tbmf_wrapper = fb_tbmf_wrapper
        bar = g_tbmf_wrapper(bar, args, args.log_interval)
    elif (
        args.tensorboard_logdir
        and getattr(args, 'use_gpu', True)
        and distributed_utils.is_master(args)
    ):
        bar = tensorboard_log_wrapper(bar, args.tensorboard_logdir, args)
    elif args.tensorboard_logdir and not getattr(args, 'use_gpu', True):
        # tpu-comment: making every core have a tensorboard writer guarantees
        #   the same work across cores.
        logdir = os.path.join(
            args.tensorboard_logdir, str(xm.get_local_ordinal())
        )
        bar = tensorboard_log_wrapper_xla(bar, logdir, args)
    return bar
Example #16
def _mp_fn(index, temp_file):
    device = xm.xla_device()
    dd = _create_state_dict(device)
    xm.save(dd, temp_file)
    ldd = torch.load(temp_file)
    pdd = _get_data_str(ldd)
    data = xm.rendezvous('xm_save_test', pdd)
    if xm.get_local_ordinal() == 0:
        os.remove(temp_file)
    for i in range(1, len(data)):
        bio = io.BytesIO(data[i])
        ildd = torch.load(bio)
        for k, v in ldd.items():
            if isinstance(v, torch.Tensor):
                assert v.allclose(ildd[k])
            elif isinstance(v, (list, tuple)):
                iv = ildd[k]
                for a, b in zip(v, iv):
                    assert a.allclose(b)
            else:
                raise RuntimeError('Invalid data type')
Example #17
 def __init__(self,
              fp16: bool = None,
              cpu: bool = False,
              _from_accelerator: bool = False):
     self.__dict__ = self._shared_state
     if not getattr(self, "initialized", False):
         if not _from_accelerator:
             raise ValueError(
                 "Please make sure to properly initialize your accelerator via `accelerator = Accelerator()` "
                 "before using any functionality from the `accelerate` library."
             )
         elif is_tpu_available() and not cpu:
             self.distributed_type = DistributedType.TPU
             self.num_processes = xm.xrt_world_size()
             self.process_index = xm.get_ordinal()
             self.local_process_index = xm.get_local_ordinal()
             self.device = xm.xla_device()
             self.use_fp16 = False
         elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
             self.distributed_type = DistributedType.MULTI_GPU
             if not torch.distributed.is_initialized():
                 torch.distributed.init_process_group(backend="nccl")
             self.num_processes = torch.distributed.get_world_size()
             self.process_index = torch.distributed.get_rank()
             self.local_process_index = int(os.environ.get(
                 "LOCAL_RANK", -1))
             self.device = torch.device("cuda", self.local_process_index)
             torch.cuda.set_device(self.device)
             self.use_fp16 = parse_flag_from_env(
                 "USE_FP16", False) if fp16 is None else fp16
         else:
             self.distributed_type = DistributedType.NO
             self.num_processes = 1
             self.process_index = self.local_process_index = 0
             self.device = torch.device(
                 "cuda" if torch.cuda.is_available() and not cpu else "cpu")
             self.use_fp16 = parse_flag_from_env(
                 "USE_FP16", False) if fp16 is None else fp16
         self.initialized = True
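Example #18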
    def __setup_tpu_training(self, model):
        # use the default device from the process
        tpu_device = xm.xla_device()

        # if given an ordinal device, use this as the device
        if self.trainer.tpu_id is not None:
            tpu_device = xm.xla_device(self.trainer.tpu_id)

        # track the device and move model to it
        self.trainer._device = tpu_device
        model.to(self.trainer._device)

        # get the appropriate tpu ranks
        self.trainer.tpu_local_core_rank = xm.get_local_ordinal()
        self.trainer.tpu_global_core_rank = xm.get_ordinal()

        # avoid duplicating progress bar
        if self.trainer.tpu_global_core_rank != 0 and self.trainer.progress_bar_callback is not None:
            self.trainer.progress_bar_callback.disable()

        self.trainer.global_rank = self.trainer.tpu_local_core_rank
        rank_zero_only.rank = self.trainer.global_rank

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model)
        self.trainer.optimizers = optimizers
        self.trainer.lr_schedulers = lr_schedulers
        self.trainer.optimizer_frequencies = optimizer_frequencies

        # init 16 bit for TPU
        if self.trainer.precision == 16:
            os.environ['XLA_USE_BF16'] = str(1)

        log.info(f'INIT TPU local core: {self.trainer.tpu_local_core_rank},'
                 f' global rank: {self.trainer.tpu_global_core_rank}')
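Example #19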
def distributed_init(cfg: FairseqConfig):
    if isinstance(cfg, Namespace):
        from fairseq.dataclass.utils import convert_namespace_to_omegaconf

        cfg = convert_namespace_to_omegaconf(cfg)

    if not cfg.common.tpu:
        if torch.distributed.is_available(
        ) and torch.distributed.is_initialized():
            warnings.warn(
                "Distributed is already initialized, cannot initialize twice!")
        else:
            logger.info("distributed init (rank {}): {}".format(
                cfg.distributed_training.distributed_rank,
                cfg.distributed_training.distributed_init_method,
            ))
            dist.init_process_group(
                backend=cfg.distributed_training.distributed_backend,
                init_method=cfg.distributed_training.distributed_init_method,
                world_size=cfg.distributed_training.distributed_world_size,
                rank=cfg.distributed_training.distributed_rank,
            )
            logger.info("initialized host {} as rank {}".format(
                socket.gethostname(),
                cfg.distributed_training.distributed_rank,
            ))

            # perform a dummy all-reduce to initialize the NCCL communicator
            if torch.cuda.is_available():
                dist.all_reduce(torch.zeros(1).cuda())

        cfg.distributed_training.distributed_rank = torch.distributed.get_rank(
        )
    else:
        assert xm.xrt_world_size(
        ) == cfg.distributed_training.distributed_world_size
        global _USE_XLA
        _USE_XLA = True
        cfg.distributed_training.device_id = xm.get_local_ordinal()
        cfg.distributed_training.distributed_rank = xm.get_ordinal()
        xm.rendezvous("distributed_init")  # wait for all workers
        xm.mark_step()

    if is_master(cfg.distributed_training):
        logging.getLogger().setLevel(logging.INFO)
    else:
        logging.getLogger().setLevel(logging.WARNING)

    if cfg.common.model_parallel_size > 1:
        try:
            from fairseq.model_parallel.megatron.mpu import (
                get_model_parallel_rank,
                initialize_model_parallel,
                model_parallel_cuda_manual_seed,
            )
        except ImportError:
            raise ImportError("\n\nPlease install the megatron submodule:"
                              "\n\n  git submodule update --init "
                              "fairseq/model_parallel/megatron")
        global _USE_MEGATRON
        _USE_MEGATRON = True
        initialize_model_parallel(cfg.common.model_parallel_size)
        model_parallel_cuda_manual_seed(cfg.common.seed)
        model_part_number = get_model_parallel_rank()
        cfg.checkpoint.checkpoint_suffix += "-model_part-{0}".format(
            model_part_number)

    return cfg.distributed_training.distributed_rank
Example #20
 def get_local_rank(self) -> int:
     return xm.get_local_ordinal()
Example #21
 def _worker_setup(self, process_idx: int):
     reset_seed()
     self.tpu_local_core_rank = xm.get_local_ordinal()
     self.tpu_global_core_rank = xm.get_ordinal()
     rank_zero_only.rank = self.global_rank
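Example #22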
 def set_world_ranks(self, process_idx: int) -> None:
     self.tpu_local_core_rank = xm.get_local_ordinal()
     self.tpu_global_core_rank = xm.get_ordinal()
     self.global_rank = self.tpu_local_core_rank
     self.world_size = self.num_nodes * self.num_processes
Example #23
 def __init__(self,
              fp16: bool = None,
              cpu: bool = False,
              deepspeed_plugin=None,
              _from_accelerator: bool = False,
              **kwargs):
     self.__dict__ = self._shared_state
     if not getattr(self, "initialized", False):
         self.backend = None
         self.deepspeed_plugin = None
         if not _from_accelerator:
             raise ValueError(
                 "Please make sure to properly initialize your accelerator via `accelerator = Accelerator()` "
                 "before using any functionality from the `accelerate` library."
             )
         elif is_tpu_available() and not cpu:
             self.distributed_type = DistributedType.TPU
             self.num_processes = xm.xrt_world_size()
             self.process_index = xm.get_ordinal()
             self.local_process_index = xm.get_local_ordinal()
             self.device = xm.xla_device()
             self.use_fp16 = False
         elif os.environ.get("USE_DEEPSPEED",
                             "false") == "true" and not cpu:
             assert (
                 is_deepspeed_available()
             ), "DeepSpeed is not available => install it using `pip3 install deepspeed` or build it from source"
             self.distributed_type = DistributedType.DEEPSPEED
             if not torch.distributed.is_initialized():
                 torch.distributed.init_process_group(backend="nccl",
                                                      **kwargs)
                 self.backend = "nccl"
             self.num_processes = torch.distributed.get_world_size()
             self.process_index = torch.distributed.get_rank()
             self.local_process_index = int(os.environ.get(
                 "LOCAL_RANK", -1))
             self.device = torch.device("cuda", self.local_process_index)
             torch.cuda.set_device(self.device)
             self.use_fp16 = False  # deepspeed handles fp16 using deepspeed_config
             fp16 = parse_flag_from_env("USE_FP16",
                                        False) if fp16 is None else fp16
             deepspeed_plugin.deepspeed_config.update(
                 {"fp16": {
                     "enabled": fp16
                 }})
             self.deepspeed_plugin = deepspeed_plugin
         elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
             self.distributed_type = DistributedType.MULTI_GPU
             if not torch.distributed.is_initialized():
                 torch.distributed.init_process_group(backend="nccl",
                                                      **kwargs)
                 self.backend = "nccl"
             self.num_processes = torch.distributed.get_world_size()
             self.process_index = torch.distributed.get_rank()
             self.local_process_index = int(os.environ.get(
                 "LOCAL_RANK", -1))
             self.device = torch.device("cuda", self.local_process_index)
             torch.cuda.set_device(self.device)
             self.use_fp16 = parse_flag_from_env(
                 "USE_FP16", False) if fp16 is None else fp16
         elif get_int_from_env([
                 "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE",
                 "WORLD_SIZE"
         ], 1) > 1:
             self.distributed_type = DistributedType.MULTI_CPU
             if is_ccl_available() and get_int_from_env(
                 ["CCL_WORKER_COUNT"], 0) > 0:
                 backend = "ccl"
             elif torch.distributed.is_mpi_available():
                 backend = "mpi"
             else:
                 backend = "gloo"
             # Try to get launch configuration from environment variables set by MPI launcher - works for Intel MPI, OpenMPI and MVAPICH
             rank = get_int_from_env([
                 "RANK", "PMI_RANK", "OMPI_COMM_WORLD_RANK",
                 "MV2_COMM_WORLD_RANK"
             ], 0)
             size = get_int_from_env([
                 "WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE",
                 "MV2_COMM_WORLD_SIZE"
             ], 1)
             local_rank = get_int_from_env([
                 "LOCAL_RANK", "MPI_LOCALRANKID",
                 "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"
             ], 0)
             local_size = get_int_from_env([
                 "MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE",
                 "MV2_COMM_WORLD_LOCAL_SIZE"
             ], 1)
             self.local_process_index = local_rank
             os.environ["RANK"] = str(rank)
             os.environ["WORLD_SIZE"] = str(size)
             os.environ["LOCAL_RANK"] = str(local_rank)
             if not os.environ.get("MASTER_PORT", None):
                 os.environ["MASTER_PORT"] = "29500"
             if not os.environ.get("MASTER_ADDR", None):
                 if local_size != size and backend != "mpi":
                     raise ValueError(
                         "Looks like distributed multinode run but MASTER_ADDR env not set, "
                         "please try exporting rank 0's hostname as MASTER_ADDR"
                     )
             if not torch.distributed.is_initialized():
                 torch.distributed.init_process_group(backend,
                                                      rank=rank,
                                                      world_size=size,
                                                      **kwargs)
                 self.backend = backend
             self.num_processes = torch.distributed.get_world_size()
             self.process_index = torch.distributed.get_rank()
             self.local_process_index = local_rank
             self.device = torch.device("cpu")
             self.use_fp16 = False
         else:
             self.distributed_type = DistributedType.NO
             self.num_processes = 1
             self.process_index = self.local_process_index = 0
             self.device = torch.device(
                 "cuda" if torch.cuda.is_available() and not cpu else "cpu")
             self.use_fp16 = parse_flag_from_env(
                 "USE_FP16", False) if fp16 is None else fp16
         self.initialized = True
Example #24
 def set_world_ranks(self, process_idx: int = 0) -> None:
     self.tpu_local_core_rank = xm.get_local_ordinal()
     self.tpu_global_core_rank = xm.get_ordinal()
Example #25
  def test_default_ordinals(self):
    global_ordinal = xm.get_ordinal()
    self.assertEqual(global_ordinal, 0)

    local_ordinal = xm.get_local_ordinal()
    self.assertEqual(local_ordinal, 0)
Example #26
 def __init__(self, fp16: bool = None, cpu: bool = False, _from_accelerator: bool = False):
     self.__dict__ = self._shared_state
     if not getattr(self, "initialized", False):
         self.backend = None
         if not _from_accelerator:
             raise ValueError(
                 "Please make sure to properly initialize your accelerator via `accelerator = Accelerator()` "
                 "before using any functionality from the `accelerate` library."
             )
         elif is_tpu_available() and not cpu:
             self.distributed_type = DistributedType.TPU
             self.num_processes = xm.xrt_world_size()
             self.process_index = xm.get_ordinal()
             self.local_process_index = xm.get_local_ordinal()
             self.device = xm.xla_device()
             self.use_fp16 = False
         elif int(os.environ.get("LOCAL_RANK", -1)) != -1 and not cpu:
             self.distributed_type = DistributedType.MULTI_GPU
             if not torch.distributed.is_initialized():
                 torch.distributed.init_process_group(backend="nccl")
                 self.backend = "nccl"
             self.num_processes = torch.distributed.get_world_size()
             self.process_index = torch.distributed.get_rank()
             self.local_process_index = int(os.environ.get("LOCAL_RANK", -1))
             self.device = torch.device("cuda", self.local_process_index)
             torch.cuda.set_device(self.device)
             self.use_fp16 = parse_flag_from_env("USE_FP16", False) if fp16 is None else fp16
         elif env2int(["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"]) > 1:
             self.distributed_type = DistributedType.MULTI_CPU
             if is_ccl_available() and env2int(["CCL_WORKER_COUNT"]) > 0:
                 backend = "ccl"
             elif torch.distributed.is_mpi_available():
                 backend = "mpi"
             else:
                 backend = "gloo"
             rank = env2int(["RANK", "PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK"], 0)
             size = env2int(["WORLD_SIZE", "PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE"], 1)
             local_rank = env2int(["LOCAL_RANK", "MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "MV2_COMM_WORLD_LOCAL_RANK"], 0)
             local_size = env2int(["MPI_LOCALNRANKS", "OMPI_COMM_WORLD_LOCAL_SIZE", "MV2_COMM_WORLD_LOCAL_SIZE"], 1)
             self.local_process_index = local_rank
             os.environ["RANK"] = str(rank)
             os.environ["WORLD_SIZE"] = str(size)
             if not os.environ.get("MASTER_PORT", None): os.environ["MASTER_PORT"] = "29500"
             if not os.environ.get("MASTER_ADDR", None):
                 if local_size != size and backend != "mpi":
                     print("Warning: Looks like distributed multinode run but MASTER_ADDR env not set, using '127.0.0.1' as default")
                     print("If this run hangs, try exporting rank 0's hostname as MASTER_ADDR")
                     os.environ["MASTER_ADDR"] = "127.0.0.1"
             if not torch.distributed.is_initialized():
                 torch.distributed.init_process_group(backend, rank=rank, world_size=size)
                 self.backend = backend
             self.num_processes = torch.distributed.get_world_size()
             self.process_index = torch.distributed.get_rank()
             self.local_process_index = local_rank
             self.local_num_processes = local_size
             self.device = torch.device("cpu")
             self.use_fp16 = False
         else:
             self.distributed_type = DistributedType.NO
             self.num_processes = 1
             self.process_index = self.local_process_index = 0
             self.device = torch.device("cuda" if torch.cuda.is_available() and not cpu else "cpu")
             self.use_fp16 = parse_flag_from_env("USE_FP16", False) if fp16 is None else fp16
         self.initialized = True
Example #27
    def pre_training(self) -> None:
        if isinstance(self.device, int):
            self.device = xm.xla_device(self.device)

        self.tpu_local_core_rank = xm.get_local_ordinal()
        self.tpu_global_core_rank = xm.get_ordinal()
Example #28
 def get_local_rank(self):
     # local ordinal of this process within its host
     return xm.get_local_ordinal()