Example #1
def get_dynamic_loss_scale_args(param_dict):
    loss_scale_args = None
    if get_fp16_enabled(param_dict):
        fp16_dict = param_dict[FP16]
        dynamic_loss_args = [
            FP16_INITIAL_SCALE_POWER, FP16_LOSS_SCALE_WINDOW,
            FP16_MIN_LOSS_SCALE, FP16_HYSTERESIS
        ]
        if any(arg in list(fp16_dict.keys()) for arg in dynamic_loss_args):
            init_scale = get_scalar_param(fp16_dict, FP16_INITIAL_SCALE_POWER,
                                          FP16_INITIAL_SCALE_POWER_DEFAULT)
            scale_window = get_scalar_param(fp16_dict, FP16_LOSS_SCALE_WINDOW,
                                            FP16_LOSS_SCALE_WINDOW_DEFAULT)
            delayed_shift = get_scalar_param(fp16_dict, FP16_HYSTERESIS,
                                             FP16_HYSTERESIS_DEFAULT)
            min_loss_scale = get_scalar_param(fp16_dict, FP16_MIN_LOSS_SCALE,
                                              FP16_MIN_LOSS_SCALE_DEFAULT)
            loss_scale_args = {
                INITIAL_LOSS_SCALE: 2**init_scale,
                SCALE_WINDOW: scale_window,
                DELAYED_SHIFT: delayed_shift,
                MIN_LOSS_SCALE: min_loss_scale
            }

    return loss_scale_args
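Every getter on this page funnels through the same get_scalar_param helper, which is not shown in the excerpts. A minimal sketch of what such a helper might look like, assuming it simply returns the value stored under a key or falls back to the supplied default (the real DeepSpeed implementation may differ):

def get_scalar_param(param_dict, param_name, param_default_value):
    # Hypothetical sketch: return the configured value when the key is
    # present, otherwise fall back to the given default.
    if param_name in param_dict:
        return param_dict[param_name]
    return param_default_value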
Example #2
def get_allgather_size(param_dict):
    # Read the configured value once; fall back to the default for
    # non-positive sizes.
    allgather_size = get_scalar_param(param_dict,
                                      ALLGATHER_SIZE,
                                      ALLGATHER_SIZE_DEFAULT)
    return allgather_size if allgather_size > 0 else ALLGATHER_SIZE_DEFAULT
Example #3
    def _initialize(self, zero_config_dict):
        self.stage = get_scalar_param(zero_config_dict,
                                      ZERO_OPTIMIZATION_STAGE,
                                      ZERO_OPTIMIZATION_STAGE_DEFAULT)

        self.contiguous_gradients = get_scalar_param(
            zero_config_dict, ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS,
            ZERO_OPTIMIZATION_CONTIGUOUS_GRADIENTS_DEFAULT)

        self.reduce_bucket_size = get_scalar_param(
            zero_config_dict, ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE,
            ZERO_OPTIMIZATION_REDUCE_BUCKET_SIZE_DEFAULT)

        self.reduce_scatter = get_scalar_param(
            zero_config_dict, ZERO_OPTIMIZATION_REDUCE_SCATTER,
            ZERO_OPTIMIZATION_REDUCE_SCATTER_DEFAULT)

        self.overlap_comm = get_scalar_param(
            zero_config_dict, ZERO_OPTIMIZATION_OVERLAP_COMM,
            ZERO_OPTIMIZATION_OVERLAP_COMM_DEFAULT)

        self.allgather_partitions = get_scalar_param(
            zero_config_dict, ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS,
            ZERO_OPTIMIZATION_ALLGATHER_PARTITIONS_DEFAULT)

        self.allgather_bucket_size = get_scalar_param(
            zero_config_dict, ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE,
            ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT)

        self.load_from_fp32_weights = get_scalar_param(
            zero_config_dict, ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS,
            ZERO_OPTIMIZATION_LOAD_FROM_FP32_WEIGHTS_DEFAULT)
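For reference, a hedged sketch of the zero_optimization dictionary that this _initialize method consumes, assuming the ZERO_OPTIMIZATION_* constants resolve to the lowercase key names used in DeepSpeed JSON configs:

# Illustrative only: key names and values are assumptions, not taken
# from the excerpt above.
zero_config_dict = {
    "stage": 2,
    "contiguous_gradients": True,
    "reduce_bucket_size": 5e8,
    "reduce_scatter": True,
    "overlap_comm": False,
    "allgather_partitions": True,
    "allgather_bucket_size": 5e8,
    "load_from_fp32_weights": True,
}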
Example #4
def get_gradient_clipping(param_dict):
    grad_clip = get_optimizer_gradient_clipping(param_dict)
    if grad_clip is not None:
        return grad_clip
    else:
        return get_scalar_param(param_dict, GRADIENT_CLIPPING,
                                GRADIENT_CLIPPING_DEFAULT)
Example #5
def get_loss_scale(param_dict):
    if get_fp16_enabled(param_dict):
        return get_scalar_param(param_dict[FP16],
                                FP16_LOSS_SCALE,
                                FP16_LOSS_SCALE_DEFAULT)
    else:
        return FP16_LOSS_SCALE_DEFAULT
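The FP16 getters (Examples #1, #5, and #9) all read from a nested fp16 section of the config. A plausible shape for that section, assuming the FP16_* constants map to the lowercase key names DeepSpeed documents:

# Assumed key names; a fixed loss_scale of 0 conventionally means
# "use dynamic loss scaling".
param_dict = {
    "fp16": {
        "enabled": True,
        "loss_scale": 0,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1,
    }
}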
Example #6
def get_tensorboard_job_name(param_dict):
    if get_tensorboard_enabled(param_dict):
        return get_scalar_param(param_dict[TENSORBOARD],
                                TENSORBOARD_JOB_NAME,
                                TENSORBOARD_JOB_NAME_DEFAULT)
    else:
        return TENSORBOARD_JOB_NAME_DEFAULT
Example #7
def get_tensorboard_enabled(param_dict):
    if TENSORBOARD in param_dict.keys():
        return get_scalar_param(param_dict[TENSORBOARD],
                                TENSORBOARD_ENABLED,
                                TENSORBOARD_ENABLED_DEFAULT)
    else:
        return False
Example #8
def get_tensorboard_output_path(param_dict):
    if get_tensorboard_enabled(param_dict):
        return get_scalar_param(param_dict[TENSORBOARD],
                                TENSORBOARD_OUTPUT_PATH,
                                TENSORBOARD_OUTPUT_PATH_DEFAULT)
    else:
        return TENSORBOARD_OUTPUT_PATH_DEFAULT
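Examples #6 through #8 follow one pattern: check the enabled flag, then read a scalar from the nested tensorboard section, returning the default when TensorBoard is disabled. A hedged sketch of that section, with the TENSORBOARD_* key names assumed:

# Assumed key names, for illustration only.
param_dict = {
    "tensorboard": {
        "enabled": True,
        "output_path": "./runs",
        "job_name": "my_experiment",
    }
}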
Example #9
def get_initial_dynamic_scale(param_dict):
    if get_fp16_enabled(param_dict):
        initial_scale_power = get_scalar_param(
            param_dict[FP16], FP16_INITIAL_SCALE_POWER,
            FP16_INITIAL_SCALE_POWER_DEFAULT)
    else:
        initial_scale_power = FP16_INITIAL_SCALE_POWER_DEFAULT

    return 2**initial_scale_power
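As a quick sanity check, a hypothetical call assuming the fp16 key names sketched above and an initial scale power of 16:

# Hypothetical usage; the key names and the power of 16 are assumptions.
scale = get_initial_dynamic_scale({"fp16": {"enabled": True,
                                            "initial_scale_power": 16}})
# scale == 2**16 == 65536, which seeds the dynamic loss scaler.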
Example #10
    def read_zero_config_deprecated(self, param_dict):
        zero_config_dict = {}
        zero_config_dict[
            ZERO_OPTIMIZATION_STAGE] = 1 if param_dict[ZERO_OPTIMIZATION] else 0
        if zero_config_dict[ZERO_OPTIMIZATION_STAGE] > 0:
            zero_config_dict[ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE] = get_scalar_param(
                param_dict,
                ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED,
                ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT)

        logger.warning(
            'DeepSpeedConfig: this format of ZeRO optimization setup is deprecated. Please use the following format: {}'
            .format(ZERO_FORMAT))
        return zero_config_dict
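The warning above points users at the newer nested format. A hedged illustration of the two shapes, assuming the deprecated form was a bare boolean and the current form is a dictionary like the one read by ZeroConfig._initialize in Example #3:

# Deprecated boolean form handled by read_zero_config_deprecated:
old_style = {"zero_optimization": True}

# Current nested form (key names assumed):
new_style = {"zero_optimization": {"stage": 1, "allgather_bucket_size": 5e8}}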
Example #11
    def _initialize(self, act_chkpt_config_dict):
        self.partition_activations = get_scalar_param(
            act_chkpt_config_dict, ACT_CHKPT_PARTITION_ACTIVATIONS,
            ACT_CHKPT_PARTITION_ACTIVATIONS_DEFAULT)

        self.contiguous_memory_optimization = get_scalar_param(
            act_chkpt_config_dict, ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION,
            ACT_CHKPT_CONTIGUOUS_MEMORY_OPTIMIZATION_DEFAULT)

        self.cpu_checkpointing = get_scalar_param(
            act_chkpt_config_dict, ACT_CHKPT_CPU_CHECKPOINTING,
            ACT_CHKPT_CPU_CHECKPOINTING_DEFAULT)

        self.number_checkpoints = get_scalar_param(
            act_chkpt_config_dict, ACT_CHKPT_NUMBER_CHECKPOINTS,
            ACT_CHKPT_NUMBER_CHECKPOINTS_DEFAULT)

        self.profile = get_scalar_param(act_chkpt_config_dict,
                                        ACT_CHKPT_PROFILE,
                                        ACT_CHKPT_PROFILE_DEFAULT)

        self.synchronize_checkpoint_boundary = get_scalar_param(
            act_chkpt_config_dict, ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY,
            ACT_CHKPT_SYNCHRONIZE_CHECKPOINT_BOUNDARY_DEFAULT)
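Example #11 reads an activation checkpointing section in the same style. A sketch of the dictionary it might receive, with the ACT_CHKPT_* key names assumed:

# Assumed key names, for illustration only.
act_chkpt_config_dict = {
    "partition_activations": True,
    "contiguous_memory_optimization": False,
    "cpu_checkpointing": False,
    "number_checkpoints": 4,
    "profile": False,
    "synchronize_checkpoint_boundary": False,
}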
Example #12
def get_sparse_gradients_enabled(param_dict):
    return get_scalar_param(param_dict, SPARSE_GRADIENTS,
                            SPARSE_GRADIENTS_DEFAULT)
Example #13
def get_gradient_accumulation_steps(param_dict):
    return get_scalar_param(param_dict, GRADIENT_ACCUMULATION_STEPS,
                            GRADIENT_ACCUMULATION_STEPS_DEFAULT)
Example #14
def get_prescale_gradients(param_dict):
    return get_scalar_param(param_dict, PRESCALE_GRADIENTS,
                            PRESCALE_GRADIENTS_DEFAULT)
Example #15
def get_allreduce_always_fp32(param_dict):
    return get_scalar_param(param_dict, FP32_ALLREDUCE, FP32_ALLREDUCE_DEFAULT)
Example #16
def get_zero_max_elements_per_comm(param_dict):
    return get_scalar_param(param_dict, ZERO_MAX_ELEMENTS_PER_COMM,
                            ZERO_MAX_ELEMENTS_PER_COMM_DEFAULT)
Example #17
def get_steps_per_print(param_dict):
    return get_scalar_param(param_dict, STEPS_PER_PRINT,
                            STEPS_PER_PRINT_DEFAULT)
Example #18
def get_train_micro_batch_size_per_gpu(param_dict):
    return get_scalar_param(param_dict, TRAIN_MICRO_BATCH_SIZE_PER_GPU,
                            TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT)
Example #19
def get_fp16_enabled(param_dict):
    if FP16 in param_dict.keys():
        return get_scalar_param(param_dict[FP16], FP16_ENABLED,
                                FP16_ENABLED_DEFAULT)
    else:
        return False
Example #20
def get_amp_enabled(param_dict):
    if AMP in param_dict.keys():
        return get_scalar_param(param_dict[AMP], AMP_ENABLED, AMP_ENABLED_DEFAULT)
    else:
        return False
Example #21
def get_train_batch_size(param_dict):
    return get_scalar_param(param_dict, TRAIN_BATCH_SIZE,
                            TRAIN_BATCH_SIZE_DEFAULT)
Example #22
def get_gradient_clipping(param_dict):
    return get_scalar_param(param_dict, GRADIENT_CLIPPING,
                            GRADIENT_CLIPPING_DEFAULT)
Example #23
def get_dump_state(param_dict):
    return get_scalar_param(param_dict, DUMP_STATE, DUMP_STATE_DEFAULT)
Example #24
def get_disable_allgather(param_dict):
    return get_scalar_param(param_dict, DISABLE_ALLGATHER,
                            DISABLE_ALLGATHER_DEFAULT)
Example #25
def get_zero_optimization(param_dict):
    return get_scalar_param(param_dict, ZERO_OPTIMIZATION,
                            ZERO_OPTIMIZATION_DEFAULT)
Example #26
def get_wall_clock_breakdown(param_dict):
    return get_scalar_param(param_dict, WALL_CLOCK_BREAKDOWN,
                            WALL_CLOCK_BREAKDOWN_DEFAULT)
Example #27
def get_zero_reduce_scatter(param_dict):
    return get_scalar_param(param_dict, ZERO_REDUCE_SCATTER,
                            ZERO_REDUCE_SCATTER_DEFAULT)
Example #28
def get_memory_breakdown(param_dict):
    return get_scalar_param(param_dict, MEMORY_BREAKDOWN,
                            MEMORY_BREAKDOWN_DEFAULT)
Example #29
def get_zero_allow_untested_optimizer(param_dict):
    return get_scalar_param(param_dict, ZERO_ALLOW_UNTESTED_OPTIMIZER,
                            ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT)
Example #30
def get_gradient_predivide_factor(param_dict):
    return get_scalar_param(param_dict, GRADIENT_PREDIVIDE_FACTOR,
                            GRADIENT_PREDIVIDE_FACTOR_DEFAULT)
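The remaining getters (Examples #12 onward) all read flat, top-level keys with a single get_scalar_param call. A hedged sketch of a top-level config that exercises a few of them, with key names assumed to match the documented DeepSpeed JSON schema:

# Assumed key names, for illustration only.
param_dict = {
    "train_batch_size": 32,
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 8,
    "gradient_clipping": 1.0,
    "steps_per_print": 100,
    "wall_clock_breakdown": False,
}
# e.g. get_train_batch_size(param_dict) -> 32
#      get_gradient_clipping(param_dict) -> 1.0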