def test_disable_compiler_config(
    time,
    name_from_base,
    sagemaker_session,
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
):
    compiler_config = TrainingCompilerConfig(enabled=False)

    hf = HuggingFace(
        py_version="py38",
        entry_point=SCRIPT_PATH,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE,
        transformers_version=huggingface_training_compiler_version,
        pytorch_version=huggingface_training_compiler_pytorch_version,
        enable_sagemaker_metrics=False,
        compiler_config=compiler_config,
    )

    inputs = "s3://mybucket/train"

    hf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    sagemaker_call_names = [c[0] for c in sagemaker_session.method_calls]
    assert sagemaker_call_names == ["train", "logs_for_job"]
    boto_call_names = [
        c[0] for c in sagemaker_session.boto_session.method_calls
    ]
    assert boto_call_names == ["resource"]

    expected_train_args = _create_train_job(
        huggingface_training_compiler_version,
        f"pytorch{huggingface_training_compiler_pytorch_version}",
        INSTANCE_TYPE,
        compiler_config,
    )
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"][
        "S3Uri"] = inputs
    expected_train_args["enable_sagemaker_metrics"] = False
    expected_train_args["hyperparameters"][
        TrainingCompilerConfig.HP_ENABLE_COMPILER] = json.dumps(False)
    expected_train_args["hyperparameters"][
        TrainingCompilerConfig.HP_ENABLE_DEBUG] = json.dumps(False)

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert (
        actual_train_args == expected_train_args
    ), f"{json.dumps(actual_train_args, indent=2)} != {json.dumps(expected_train_args, indent=2)}"
Example #2
    def test_trcomp_enabled(self, patched, sagemaker_session, ecr_image,
                            tmpdir, capsys):
        '''
        Tests the explicit enabled configuration of SM trcomp
        '''
        instance_type = "ml.p3.2xlarge"
        instance_count = 1

        estimator = HuggingFace(
            compiler_config=TrainingCompilerConfig(enabled=True),
            entry_point="train.py",
            source_dir=BERT_PATH,
            role="SageMakerRole",
            instance_type=instance_type,
            instance_count=instance_count,
            image_uri=ecr_image,
            py_version=py_version,
            sagemaker_session=sagemaker_session,
            hyperparameters=hyperparameters,
            debugger_hook_config=False,  # currently needed
            max_retry_attempts=15,
        )

        estimator.fit(
            job_name=unique_name_from_base("hf-tf-trcomp-single-gpu-enabled"),
            logs=True)

        captured = capsys.readouterr()
        logs = captured.out + captured.err
        assert "Found configuration for Training Compiler" in logs
Example #3
def test_single_node_single_gpu_tcc_default(patched, docker_image, processor,
                                            instance_type,
                                            sagemaker_local_session,
                                            py_version, capsys):
    '''
    Single-GPU test exercising the local_gpu instance type with the default
    TrainingCompilerConfig. All local mode tests (PT and TF) run serially on
    a single instance.
    '''
    hyperparameters = {
        "max_steps": 3,
        "train_batch_size": 4,
        "model_name": "distilbert-base-uncased"
    }

    estimator = HuggingFace(
        compiler_config=TrainingCompilerConfig(),
        entry_point=distrilbert_script,
        instance_type="local_gpu",
        sagemaker_session=sagemaker_local_session,
        image_uri=docker_image,
        instance_count=1,
        role=ROLE,
        hyperparameters=hyperparameters,
        environment={
            'GPU_NUM_DEVICES': '1'
        },  # https://github.com/aws/sagemaker-training-toolkit/issues/107
        py_version=py_version,
    )

    estimator.fit()
Example #4
    def test_trcomp_debug(self, patched, ecr_image, sagemaker_session, tmpdir,
                          py_version, capsys):
        '''
        Tests the debug mode configuration of SM trcomp
        '''
        transformers_version = get_transformers_version(ecr_image)
        git_config = {
            'repo': 'https://github.com/huggingface/transformers.git',
            'branch': 'v' + transformers_version
        }

        instance_count = 1
        instance_type = "ml.p3.2xlarge"

        source_dir = ("./examples/question-answering"
                      if Version(transformers_version) < Version("4.6") else
                      "./examples/pytorch/question-answering")

        with timeout(minutes=DEFAULT_TIMEOUT):
            estimator = HuggingFace(
                compiler_config=TrainingCompilerConfig(debug=True),
                entry_point='run_qa.py',
                source_dir=source_dir,
                git_config=git_config,
                metric_definitions=metric_definitions,
                role='SageMakerRole',
                image_uri=ecr_image,
                instance_count=instance_count,
                instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                hyperparameters=hyperparameters,
                environment={'GPU_NUM_DEVICES': '1'},
                py_version=py_version,
                max_retry_attempts=15,
            )
            estimator.fit(
                job_name=sagemaker.utils.unique_name_from_base('hf-pt-trcomp-single-gpu-debug'),
                logs=True,
            )

        captured = capsys.readouterr()
        logs = captured.out + captured.err
        assert "Found configuration for Training Compiler" in logs
        assert "Training Compiler set to debug mode" in logs
        assert "Configuring SM Training Compiler" in logs
        assert "device: xla" in logs

        debug_artifact_path = estimator.model_data.replace(
            'model.tar.gz', 'output.tar.gz')
        debug_artifact = os.path.join(tmpdir, 'output.tar.gz')
        subprocess.check_output(
            ['aws', 's3', 'cp', debug_artifact_path, debug_artifact])
        with tarfile.open(debug_artifact, 'r:gz') as tarball:
            tarball.extractall(path=tmpdir)
        xla_metrics_file = os.path.join(tmpdir, 'compiler',
                                        'XLA_METRICS_FILE.txt')
        assert os.path.exists(xla_metrics_file)
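A hedged alternative, not from the test suite: the same debug artifact could be fetched with boto3 instead of shelling out to the AWS CLI, assuming model_data follows the usual s3://bucket/key layout.
import os

import boto3


def download_debug_artifact(model_data, tmpdir):
    # Locate output.tar.gz next to model.tar.gz, mirroring the test above.
    artifact_uri = model_data.replace("model.tar.gz", "output.tar.gz")
    # Split "s3://bucket/key" into bucket and key (assumes that URI layout).
    bucket, _, key = artifact_uri[len("s3://"):].partition("/")
    local_path = os.path.join(tmpdir, "output.tar.gz")
    boto3.client("s3").download_file(bucket, key, local_path)
    return local_path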
Example #5
def test_unsupported_framework_mxnet(huggingface_training_compiler_version):
    with pytest.raises(ValueError):
        HuggingFace(
            py_version="py38",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            mxnet_version=".".join(
                ["99"] *
                len(huggingface_training_compiler_version.split("."))),
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        ).fit()
Example #6
def test_unsupported_python_2(
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
):
    with pytest.raises(ValueError):
        HuggingFace(
            py_version="py27",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            pytorch_version=huggingface_training_compiler_pytorch_version,
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        ).fit()
Example #7
    def test_trcomp_enabled(self, patched, ecr_image, sagemaker_session,
                            tmpdir, py_version, capsys):
        '''
        Tests the explicit enabled configuration of SM trcomp
        '''
        transformers_version = get_transformers_version(ecr_image)
        git_config = {
            'repo': 'https://github.com/huggingface/transformers.git',
            'branch': 'v' + transformers_version
        }

        instance_count = 1
        instance_type = "ml.p3.2xlarge"

        source_dir = ("./examples/question-answering"
                      if Version(transformers_version) < Version("4.6") else
                      "./examples/pytorch/question-answering")

        with timeout(minutes=DEFAULT_TIMEOUT):
            estimator = HuggingFace(
                compiler_config=TrainingCompilerConfig(enabled=True),
                entry_point='run_qa.py',
                source_dir=source_dir,
                git_config=git_config,
                metric_definitions=metric_definitions,
                role='SageMakerRole',
                image_uri=ecr_image,
                instance_count=instance_count,
                instance_type=instance_type,
                sagemaker_session=sagemaker_session,
                hyperparameters=hyperparameters,
                environment={'GPU_NUM_DEVICES': '1'},
                py_version=py_version,
                max_retry_attempts=15,
            )
            estimator.fit(
                job_name=sagemaker.utils.unique_name_from_base('hf-pt-trcomp-single-gpu-enabled'),
                logs=True,
            )
        captured = capsys.readouterr()
        logs = captured.out + captured.err
        assert "Found configuration for Training Compiler" in logs
        assert "Configuring SM Training Compiler" in logs
        assert "device: xla" in logs
Example #8
def test_huggingface_pytorch(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_compiler_latest_version,
    huggingface_training_compiler_pytorch_latest_version,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")

        hf = HuggingFace(
            py_version="py38",
            entry_point=os.path.join(data_path, "run_glue.py"),
            role="SageMakerRole",
            transformers_version=huggingface_training_compiler_latest_version,
            pytorch_version=huggingface_training_compiler_pytorch_latest_version,
            instance_count=1,
            instance_type=gpu_instance_type,
            hyperparameters={
                "model_name_or_path": "distilbert-base-cased",
                "task_name": "wnli",
                "do_train": True,
                "do_eval": True,
                "max_seq_length": 128,
                "fp16": True,
                "per_device_train_batch_size": 128,
                "output_dir": "/opt/ml/model",
            },
            environment={"GPU_NUM_DEVICES": "1"},
            sagemaker_session=sagemaker_session,
            disable_profiler=True,
            compiler_config=TrainingCompilerConfig(),
        )

        train_input = hf.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/huggingface/train",
        )

        hf.fit(train_input)
Example #9
def test_unsupported_BYOC(
    huggingface_training_compiler_version,
    huggingface_training_compiler_pytorch_version,
):
    byoc = (
        "1.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-trcomp-training:"
        "1.9.0-"
        "transformers4.10.2-gpu-"
        "py38-cu111-ubuntu20.04")
    with pytest.raises(ValueError):
        HuggingFace(
            image_uri=byoc,
            py_version="py38",
            entry_point=SCRIPT_PATH,
            role=ROLE,
            instance_count=INSTANCE_COUNT,
            instance_type=INSTANCE_TYPE,
            transformers_version=huggingface_training_compiler_version,
            pytorch_version=huggingface_training_compiler_pytorch_version,
            enable_sagemaker_metrics=False,
            compiler_config=TrainingCompilerConfig(),
        ).fit()
Example #10
def test_huggingface_tensorflow(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_compiler_latest_version,
    huggingface_training_compiler_tensorflow_latest_version,
):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")

        hf = HuggingFace(
            py_version="py38",
            entry_point=os.path.join(data_path, "run_tf.py"),
            role="SageMakerRole",
            transformers_version=huggingface_training_compiler_latest_version,
            tensorflow_version=huggingface_training_compiler_tensorflow_latest_version,
            instance_count=1,
            instance_type=gpu_instance_type,
            hyperparameters={
                "model_name_or_path": "distilbert-base-cased",
                "per_device_train_batch_size": 128,
                "per_device_eval_batch_size": 128,
                "output_dir": "/opt/ml/model",
                "overwrite_output_dir": True,
                "save_steps": 5500,
            },
            sagemaker_session=sagemaker_session,
            disable_profiler=True,
            compiler_config=TrainingCompilerConfig(),
        )

        train_input = hf.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/huggingface/train")

        hf.fit(train_input)