Example #1
    def calculate_derived(self):
        """
        Derives additional configuration values necessary for training from the current config
        """

        # wandb
        # sets a unique wandb group
        if self.wandb_group is None:
            # if none is defined a uuid is set for the run
            self.wandb_group = shortuuid.uuid()

        # number of gpus
        # Get number of GPUs param or hostfile to determine train_batch_size
        num_gpus = self.num_gpus
        if num_gpus is None:
            num_gpus = -1  # set to -1 for backwards compatibility with the old default value
        if num_gpus < 1:
            if self.hostfile is not None or os.path.exists(DLTS_HOSTFILE):
                hostfile_path = self.hostfile or DLTS_HOSTFILE
                resources = obtain_resource_pool(hostfile_path, self.include or "", self.exclude or "")
                num_gpus = sum(map(len, resources.values()))
            else:
                num_gpus = torch.cuda.device_count()
        self.update_value("num_gpus", num_gpus)

        logging.info(f"{self.__class__.__name__}.calculate_derived() "
                     f"Total number of GPUs determined to be: {self.num_gpus}")

        # get the data-parallel world size: in the model/pipe parallel case, the actual `world size`
        # deepspeed uses is the size of the data-parallel group, i.e. (num_gpus / pp_size) / mp_size
        pp_size = self.pipe_parallel_size
        pp_size = pp_size if pp_size >= 1 else 1
        mp_size = self.model_parallel_size
        mp_size = mp_size if mp_size >= 1 else 1
        self.update_value("model_parallel_size", mp_size)

        # pp_size and mp_size are only used here to compute dp world size and nowhere else.
        dp_world_size = ((num_gpus / pp_size) / mp_size)
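        # Worked example (illustrative numbers only): with num_gpus=16, pp_size=2 and
        # mp_size=4, dp_world_size = (16 / 2) / 4 = 2.0 and the check below passes;
        # with num_gpus=12, pp_size=2 and mp_size=8 it would be 0.75 and raise.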
        if dp_world_size % 1 != 0:
            error_message = (
                f"{self.__class__.__name__}.calculate_derived() "
                f"(num_gpus / pp_size) / mp_size [({num_gpus} / {pp_size}) / {mp_size}] must be a whole number"
            )
            logging.error(error_message)
            raise AssertionError(error_message)

        # Automatically derive train_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * dp_world_size
        train_batch_size, train_micro_batch_size_per_gpu, gradient_accumulation_steps = self.calculate_batch_parameters(
            dp_world_size=dp_world_size,
            train_batch=self.train_batch_size,
            micro_batch=self.train_micro_batch_size_per_gpu,
            grad_acc=self.gradient_accumulation_steps
        )
        self.check_batch_parameters(
            dp_world_size=dp_world_size,
            train_batch=train_batch_size,
            micro_batch=train_micro_batch_size_per_gpu,
            grad_acc=gradient_accumulation_steps
        )
        self.update_values({
            # batch size params
            "train_batch_size": train_batch_size,
            "train_micro_batch_size_per_gpu": train_micro_batch_size_per_gpu,
            "gradient_accumulation_steps": gradient_accumulation_steps,
            "batch_size": train_micro_batch_size_per_gpu,

            # duplicate items
            "gas": self.gradient_accumulation_steps,
            "clip_grad": self.gradient_clipping,

        })
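        # Worked example (illustrative numbers only): with dp_world_size=2,
        # train_micro_batch_size_per_gpu=4 and gradient_accumulation_steps=8, the derived
        # train_batch_size is 4 * 8 * 2 = 64; if only train_batch_size were given, the
        # remaining factors would be derived from the same relation.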
        
        # derive precision
        if (self.fp16 or {}).get("type", self.precision) == "bfloat16":
            self.update_value("precision", "bfloat16")
        elif (self.fp16 or {}).get("enabled", False):
            self.update_value("precision", "fp16")
        else:
            self.update_value("precision", "fp32")
        

        # zero optimization
        if self.zero_optimization is None:
            self.zero_optimization = copy.deepcopy(ZERO_DEFAULTS)  # a dict is overwritten and not updated key by key
        self.update_values({
            "zero_stage": self.zero_optimization.get('stage', ZERO_DEFAULTS['stage']),
            "zero_reduce_scatter": self.zero_optimization.get('reduce_scatter', ZERO_DEFAULTS['reduce_scatter']),
            "zero_contiguous_gradients": self.zero_optimization.get('contiguous_gradients',
                                                                    ZERO_DEFAULTS['contiguous_gradients']),
            "zero_reduce_bucket_size": self.zero_optimization.get('reduce_bucket_size',
                                                                  ZERO_DEFAULTS['reduce_bucket_size']),
            "zero_allgather_bucket_size": self.zero_optimization.get('allgather_bucket_size',
                                                                     ZERO_DEFAULTS['allgather_bucket_size'])
        })
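        # Illustrative example: a user-supplied zero_optimization = {"stage": 1} sets
        # zero_stage to 1 while the remaining zero_* values fall back to ZERO_DEFAULTS.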

        # optimizer and scheduler
        opt_params = self.optimizer or {"type": OPT_DEFAULT, "params": OPT_PARAMS_DEFAULTS}
        self.update_values({
            "optimizer_type": opt_params.get('type', OPT_DEFAULT),
            "lr": opt_params['params'].get('lr', OPT_PARAMS_DEFAULTS['lr'])
        })

        if self.optimizer_type.lower() == "onebitadam":
            # onebitadam needs to be instantiated by deepspeed, and so we need to pass deepspeed scheduler args
            # for all other optimizers, the scheduling is handled by megatron
            self.scheduler = {
                "type": "WarmupDecayLR",  # for now this is the only ds scheduler offering decay
                "params": {
                    "warmup_min_lr": 0,
                    "warmup_max_lr": self.lr,
                    "warmup_num_steps": int(self.train_iters * self.warmup),
                    "total_num_steps": self.lr_decay_iters or self.train_iters
                }}
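            # Worked example (illustrative numbers only): with train_iters=320000 and
            # warmup=0.01, warmup_num_steps = int(320000 * 0.01) = 3200; total_num_steps
            # falls back to train_iters when lr_decay_iters is not set.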

        # Fp16 loss scaling.
        self.update_value("dynamic_loss_scale", self.loss_scale is None)

        # Update 'is pipe parallel' flag
        # if we set pipe_parallel_size to 0 or 1, GPT2ModelPipe.to_sequential() is called, and we run training with
        # the sequential model without the PipelineModule wrapper to avoid the overhead it incurs
        self.update_value("is_pipe_parallel", self.pipe_parallel_size >= 1)

        # Attention config
        if self.attention_config is None:
            self.update_value("attention_config", [[["global"], self.num_layers]])
        self.update_value("attention_config", expand_attention_types(self.attention_config, self.num_layers))
        assert len(self.attention_config) == self.num_layers, "Length of attention config list must equal num_layers"
        for item in self.attention_config:
            assert item in ATTENTION_TYPE_CHOICES, f"Attention type {item} not recognized"
        if "gmlp" in self.attention_config or "amlp" in self.attention_config:
            assert not self.partition_activations, "GMLP Blocks are not compatible with partition activations"

        # Sparsity config
        if self.sparsity_config is None:
            # An empty dict can't be used as the default value, so it is set here instead
            self.update_value("sparsity_config", {})

        # Adding equal dataset weights if none are provided
        if self.train_data_paths and (self.train_data_weights is None):
            self.train_data_weights = [1.] * len(self.train_data_paths)
        if self.valid_data_paths and (self.valid_data_weights is None):
            self.valid_data_weights = [1.] * len(self.valid_data_paths)
        if self.test_data_paths and (self.test_data_weights is None):
            self.test_data_weights = [1.] * len(self.test_data_paths)
Example #2
    def derive_params_and_split(conf):
        """
        Derive and insert implicit parameters
        """

        # Get number of GPUs param or hostfile to determine train_batch_size
        num_gpus = conf.get('num_gpus')
        if num_gpus is None:
            if 'hostfile' in conf or os.path.exists(DLTS_HOSTFILE):
                hostfile_path = conf.get('hostfile', DLTS_HOSTFILE)
                resources = obtain_resource_pool(hostfile_path,
                                                 conf.get('include', ''),
                                                 conf.get('exclude', ''))
                num_gpus = sum(map(len, resources.values()))
            else:
                num_gpus = torch.cuda.device_count()
            # store the derived value back into the config regardless of which branch ran
            conf["num_gpus"] = num_gpus

        log.info(f"Total number of GPUs determined to be: {num_gpus}")

        # get the data-parallel world size: in the model/pipe parallel case, the actual `world size`
        # deepspeed uses is the size of the data-parallel group, i.e. (num_gpus / pp_size) / mp_size
        pp_size = conf.get('pipe-parallel-size', 0)
        pp_size = pp_size if pp_size >= 1 else 1
        mp_size = conf.get('model-parallel-size', 0)
        mp_size = mp_size if mp_size >= 1 else 1

        # pp_size and mp_size are only used here to compute world_size and nowhere else. The way that these values actually get to deepspeed
        # is through convert_to_old_args. The entire chain of how that happens:
        # https://github.com/EleutherAI/gpt-neox/blob/2ceefba0ef12b94eb35a518f7dea9f34fc43c9af/megatron/arguments.py#L430
        # https://github.com/EleutherAI/gpt-neox/blob/2ceefba0ef12b94eb35a518f7dea9f34fc43c9af/megatron/arguments.py#L45
        # https://github.com/EleutherAI/gpt-neox/blob/2ceefba0ef12b94eb35a518f7dea9f34fc43c9af/megatron/config_monster.py#L17
        # https://github.com/EleutherAI/gpt-neox/blob/2ceefba0ef12b94eb35a518f7dea9f34fc43c9af/megatron/config_monster.py#L40
        # https://github.com/EleutherAI/gpt-neox/blob/2ceefba0ef12b94eb35a518f7dea9f34fc43c9af/megatron/config_monster.py#L330

        world_size = ((num_gpus / pp_size) / mp_size)
        assert world_size % 1 == 0, f"(num_gpus / pp_size) / mp_size [({num_gpus} / {pp_size}) / {mp_size}] must be a whole number"

        # Automatically derive train_batch_size = train_micro_batch_size_per_gpu * gradient_accumulation_steps * world_size
        (conf['train_batch_size'],
         conf['train_micro_batch_size_per_gpu'],
         conf['gradient_accumulation_steps']) = _configure_train_batch_size(
             world_size,
             conf.get('train_batch_size'),
             conf.get('train_micro_batch_size_per_gpu'),
             conf.get('gradient_accumulation_steps'))
        conf['gradient_accumulation_steps'] = int(conf['gradient_accumulation_steps'])
        # megatron needs the micro batch size under the 'batch-size' key
        conf['batch-size'] = conf['train_micro_batch_size_per_gpu']

        ds_runner_conf = {
            key: conf[key]
            for key in ds_runner_keys if key in conf
        }
        megatron_conf = {
            key: conf[key]
            for key in megatron_keys + neox_config_keys if key in conf
        }
        ds_config_conf = {
            key: conf[key]
            for key in ds_config_keys if key in conf
        }
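
        # Note (editorial assumption, inferred from the key lists above): the split mirrors
        # the three downstream consumers - ds_runner_conf feeds the deepspeed launcher,
        # megatron_conf is converted to megatron-style command-line arguments, and
        # ds_config_conf is serialized into the deepspeed config.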

        # Items duplicated
        megatron_conf['deepspeed'] = True  # should always be using deepspeed
        ds_config_conf['deepspeed'] = True
        megatron_conf['fp16'] = conf.get('fp16', {}).get('enabled', False)
        megatron_conf['gas'] = conf.get('gradient_accumulation_steps')
        _set_zero_params(ds_config_conf, megatron_conf)
        megatron_conf['clip-grad'] = ds_config_conf.get(
            'gradient_clipping', GRADIENT_CLIPPING_DEFAULT)
        _set_scheduler_params(ds_config_conf, megatron_conf)
        _set_optimizer_params(ds_config_conf, megatron_conf)

        return ds_runner_conf, megatron_conf, ds_config_conf