Exemple #1
0
def test_backward(batch_size,
                  hidden_size,
                  seq_len,
                  heads,
                  num_layers,
                  is_preln,
                  use_fp16,
                  atol):
    """Populate a DeepSpeedTransformerConfig and hand it to ``run_backward``.

    Both dropout ratios are pinned to 0.0 and ``initializer_range`` to 0.02;
    all remaining settings are taken from the arguments. ``atol`` is forwarded
    unchanged to ``run_backward`` as a keyword argument.

    The function returns early, without building a config, when the CUDA
    device's major compute capability is below 7 and either
    ``use_fp16 is True`` or ``is_preln is False``.
    """
    cc_major, _cc_minor = torch.cuda.get_device_capability()

    # Cases not exercised on devices below capability 7: fp16 enabled, or
    # is_preln explicitly False (identity checks, so only real booleans match).
    restricted_case = use_fp16 is True or is_preln is False
    if restricted_case and cc_major < 7:
        return

    config = DeepSpeedTransformerConfig()
    settings = (
        ("layer_id", None),
        ("batch_size", batch_size),
        ("hidden_size", hidden_size),
        ("max_seq_length", seq_len),
        ("heads", heads),
        ("attn_dropout_ratio", 0.0),
        ("hidden_dropout_ratio", 0.0),
        ("num_hidden_layers", num_layers),
        ("pre_layer_norm", is_preln),
        ("initializer_range", 0.02),
        ("fp16", use_fp16),
    )
    # One attribute assignment per entry, applied in the listed order.
    for attr_name, attr_value in settings:
        setattr(config, attr_name, attr_value)

    run_backward(config, atol=atol)
def test_forward_stochastic(batch_size,
                            hidden_size,
                            seq_len,
                            heads,
                            num_layers,
                            is_preln,
                            use_fp16):
    """Populate a stochastic-mode DeepSpeedTransformerConfig and call ``run_forward``.

    The config has ``stochastic_mode`` set to True, ``intermediate_size`` set
    to four times ``hidden_size``, both dropout ratios at 0.0 and
    ``initializer_range`` at 0.02; the rest comes from the arguments.
    ``run_forward`` receives the config, ``seq_len`` and a fixed ``atol`` of
    7e-2.

    The function returns early, without building a config, when
    ``use_fp16 is True`` and the CUDA device's major compute capability is
    below 7.
    """
    cc_major, _cc_minor = torch.cuda.get_device_capability()

    # fp16 cases are only run on devices with capability 7 or higher
    # (identity check, so only the real boolean True triggers the skip).
    fp16_requested = use_fp16 is True
    if fp16_requested and cc_major < 7:
        return

    config = DeepSpeedTransformerConfig()
    settings = (
        ("layer_id", None),
        ("batch_size", batch_size),
        ("hidden_size", hidden_size),
        ("intermediate_size", 4 * hidden_size),
        ("max_seq_length", seq_len),
        ("heads", heads),
        ("attn_dropout_ratio", 0.0),
        ("hidden_dropout_ratio", 0.0),
        ("num_hidden_layers", num_layers),
        ("pre_layer_norm", is_preln),
        ("initializer_range", 0.02),
        ("fp16", use_fp16),
        ("stochastic_mode", True),
    )
    # One attribute assignment per entry, applied in the listed order.
    for attr_name, attr_value in settings:
        setattr(config, attr_name, attr_value)

    run_forward(config, seq_len, atol=7e-2)