def _prepare_data(
    self,
    data_name: str,
    x_data: DataFrame,
    y_data: Optional[DataFrame] = None,
    s3_input_type: bool = True,
) -> Union[TrainingInput, str]:
    """
    Prepares the data to use in the learner.

    The S3 cache is consulted first, then the local cache. When neither has
    an entry, the data is written to a CSV file (no index, no header, with
    the ``y_data`` columns placed before the ``x_data`` columns) inside
    ``self.local_save_folder``. The local file is then uploaded and the
    resulting location is stored in the S3 cache. Data named ``"predict"``
    skips both cache lookups and is never added to the cache.

    Arguments:
        data_name: name of the data set; used as cache key and as the name
            of the local CSV file
        x_data: the features of the data
        y_data: (optional) the output of the data. Don't provide for predictions
        s3_input_type: if True, wrap the location in a ``TrainingInput`` with
            content type ``text/csv``; if False, return the location as
            obtained from the cache or from the upload

    Returns:
        The s3 input or s3 location of the data
    """
    LOGGER.info("Preparing data for usage")

    # Prediction data is never read from nor written to the cache.
    use_cache = data_name != "predict"

    # Try to get from cache
    s3_location = self.data.get_from_cache("s3", data_name) if use_cache else None
    if s3_location is not None:
        LOGGER.info("Found s3 data in cache.")
        if s3_input_type:
            return TrainingInput(s3_location, content_type="text/csv")
        return s3_location

    # Try to get local data from cache
    temp_location = self.data.get_from_cache("local", data_name) if use_cache else None

    # If not available, save to local
    if temp_location is None:
        # exist_ok=True avoids the check-then-create race of a separate
        # os.path.exists() test when several callers prepare data at once.
        os.makedirs(self.local_save_folder, exist_ok=True)
        temp_location = f"{self.local_save_folder}/{data_name}.csv"
        if y_data is not None:
            data = pd.concat([y_data, x_data], axis=1)
        else:
            data = x_data
        LOGGER.info("Writing data to local machine")
        data.to_csv(temp_location, index=False, header=False)
    else:
        # Log if present
        LOGGER.info("Found local data location in cache.")

    # Upload to S3
    s3_location = self.executor.upload_data(temp_location, prefix=self.input_data_prefix)

    # Put in cache
    if use_cache:
        self.data.add_to_cache("s3", data_name, s3_location)

    if s3_input_type:
        return TrainingInput(s3_location, content_type="text/csv")
    return s3_location
def test_dict_of_mixed_input_types():
    """Check the channels built from a dict mixing a URI string and a TrainingInput."""

    def _default_s3_channel(channel_name, s3_uri):
        # Expected channel entry with the default distribution and data type.
        return {
            "ChannelName": channel_name,
            "DataSource": {
                "S3DataSource": {
                    "S3DataDistributionType": "FullyReplicated",
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3_uri,
                }
            },
        }

    channels = _Job._format_inputs_to_input_config(
        {"a": "s3://foo/bar", "b": TrainingInput("s3://whizz/bang")}
    )
    wanted = [
        _default_s3_channel("a", "s3://foo/bar"),
        _default_s3_channel("b", "s3://whizz/bang"),
    ]

    # Key both sides by channel name so the (arbitrary) list order is ignored.
    channels_by_name = {channel["ChannelName"]: channel for channel in channels}
    wanted_by_name = {channel["ChannelName"]: channel for channel in wanted}
    assert channels_by_name == wanted_by_name
def test_training_input_all_arguments():
    """Check the config produced when every TrainingInput argument is supplied."""
    arguments = {
        "s3_data": "pre",
        "distribution": "FullyReplicated",
        "compression": "Gzip",
        "input_mode": "Pipe",
        "content_type": "text/csv",
        "record_wrapping": "RecordIO",
        "s3_data_type": "Manifestfile",
    }

    training_input = TrainingInput(**arguments)

    assert training_input.config == {
        "DataSource": {
            "S3DataSource": {
                "S3DataDistributionType": arguments["distribution"],
                "S3DataType": arguments["s3_data_type"],
                "S3Uri": arguments["s3_data"],
            }
        },
        "CompressionType": arguments["compression"],
        "ContentType": arguments["content_type"],
        "RecordWrapperType": arguments["record_wrapping"],
        "InputMode": arguments["input_mode"],
    }
def test_format_inputs_to_input_config_training_input():
    """The first formatted channel carries the same S3 URI as the TrainingInput passed in."""
    training_input = TrainingInput(BUCKET_NAME)
    formatted = _Job._format_inputs_to_input_config(training_input)

    channel_uri = formatted[0]["DataSource"]["S3DataSource"]["S3Uri"]
    input_uri = training_input.config["DataSource"]["S3DataSource"]["S3Uri"]
    assert channel_uri == input_uri
def test_format_string_uri_input():
    """The result of _format_string_uri_input has the same S3 URI as the TrainingInput given."""
    training_input = TrainingInput(BUCKET_NAME)
    formatted = _Job._format_string_uri_input(training_input)

    formatted_uri = formatted.config["DataSource"]["S3DataSource"]["S3Uri"]
    input_uri = training_input.config["DataSource"]["S3DataSource"]["S3Uri"]
    assert formatted_uri == input_uri
def main():
    """Train an XGBoost script-mode model locally, deploy it, and run two sample inferences.

    The same ``file://`` input is used for both the ``train`` and the
    ``validation`` channel. The endpoint is deleted once inference is done,
    including when an inference call raises.
    """
    print('Starting model training.')
    print('Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')

    hyperparameters = {
        "max_depth": "5",
        "eta": "0.2",
        "gamma": "4",
        "min_child_weight": "6",
        "subsample": "0.7",
        "objective": "reg:squarederror",
        "num_round": "50",
        "verbosity": "2",
    }

    xgb_script_mode_estimator = XGBoost(
        entry_point="./code/abalone.py",
        hyperparameters=hyperparameters,
        role=DUMMY_IAM_ROLE,
        instance_count=1,
        instance_type='local',
        framework_version="1.2-1"
    )

    train_input = TrainingInput("file://./data/train/abalone", content_type="text/libsvm")

    xgb_script_mode_estimator.fit({"train": train_input, "validation": train_input})
    print('Completed model training')

    model_data = xgb_script_mode_estimator.model_data
    print(model_data)

    xgb_inference_model = XGBoostModel(
        model_data=model_data,
        role=DUMMY_IAM_ROLE,
        entry_point="./code/inference.py",
        framework_version="1.2-1",
    )

    print('Deploying endpoint in local mode')
    predictor = xgb_inference_model.deploy(
        initial_instance_count=1,
        instance_type="local",
    )

    try:
        a_young_abalone = "6 1:3 2:0.37 3:0.29 4:0.095 5:0.249 6:0.1045 7:0.058 8:0.067"
        do_inference_on_local_endpoint(predictor, a_young_abalone)

        an_old_abalone = "15 1:1 2:0.655 3:0.53 4:0.175 5:1.2635 6:0.486 7:0.2635 8:0.415"
        do_inference_on_local_endpoint(predictor, an_old_abalone)
    finally:
        # Clean up even when inference fails so the endpoint is not left running.
        print('About to delete the endpoint to stop paying (if in cloud mode).')
        # Predictor.delete_endpoint() acts on the predictor's own endpoint; its
        # only parameter is the delete_endpoint_config flag (default True), so
        # the endpoint name must not be passed positionally.
        predictor.delete_endpoint()
def test_training_input_all_defaults(caplog):
    """Check the config of a TrainingInput built from nothing but an S3 location."""
    s3_location = "pre"

    training_input = TrainingInput(s3_data=s3_location)

    default_s3_source = {
        "S3DataDistributionType": "FullyReplicated",
        "S3DataType": "S3Prefix",
        "S3Uri": s3_location,
    }
    assert training_input.config == {"DataSource": {"S3DataSource": default_s3_source}}
def test_load_config(estimator):
    """Check the input, role, output, resource and stop-condition entries of a loaded config."""
    training_input = TrainingInput(BUCKET_NAME)
    loaded = _Job._load_config(training_input, estimator)

    first_channel = loaded["input_config"][0]
    assert first_channel["DataSource"]["S3DataSource"]["S3Uri"] == BUCKET_NAME
    assert loaded["role"] == ROLE

    output_config = loaded["output_config"]
    assert output_config["S3OutputPath"] == S3_OUTPUT_PATH
    assert "KmsKeyId" not in output_config

    resource_config = loaded["resource_config"]
    assert resource_config["InstanceCount"] == INSTANCE_COUNT
    assert resource_config["InstanceType"] == INSTANCE_TYPE
    assert resource_config["VolumeSizeInGB"] == VOLUME_SIZE

    assert loaded["stop_condition"]["MaxRuntimeInSeconds"] == MAX_RUNTIME
def test_load_config_with_code_channel_no_code_uri(framework):
    """Check the loaded config when a model URI, model channel and network isolation are set.

    Two input channels are expected, the first one pointing at ``BUCKET_NAME``.
    """
    framework.model_uri = MODEL_URI
    framework.model_channel_name = MODEL_CHANNEL_NAME
    framework._enable_network_isolation = True

    loaded = _Job._load_config(TrainingInput(BUCKET_NAME), framework)

    input_channels = loaded["input_config"]
    assert len(input_channels) == 2
    assert input_channels[0]["DataSource"]["S3DataSource"]["S3Uri"] == BUCKET_NAME
    assert loaded["role"] == ROLE

    output_config = loaded["output_config"]
    assert output_config["S3OutputPath"] == S3_OUTPUT_PATH
    assert "KmsKeyId" not in output_config

    resource_config = loaded["resource_config"]
    assert resource_config["InstanceCount"] == INSTANCE_COUNT
    assert resource_config["InstanceType"] == INSTANCE_TYPE
def run_test(ecr_image, sagemaker_session, instance_type, framework_version, test_data,
             record_wrapper_type=None):
    """Fit a Pipe-mode TensorFlow estimator on the data produced by ``test_data``.

    ``test_data`` is called with the SageMaker session and its result is used
    as the S3 data of the single ``elizabeth`` channel. The fit call runs
    inside a 20 minute ``timeout`` context.
    """
    pipemode_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'resources', 'pipemode')
    estimator = TensorFlow(
        entry_point=os.path.join(pipemode_dir, 'pipemode.py'),
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=1,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        input_mode='Pipe',
        hyperparameters={'dimension': DIMENSION},
    )

    pipe_channel = TrainingInput(
        s3_data=test_data(sagemaker_session),
        distribution='FullyReplicated',
        record_wrapping=record_wrapper_type,
        input_mode='Pipe',
    )

    with timeout(minutes=20):
        estimator.fit(
            {'elizabeth': pipe_channel},
            job_name=unique_name_from_base('test-sagemaker-pipemode'),
        )
def test_format_input_training_input():
    """A single TrainingInput is formatted into one channel named ``training``."""
    training_input = TrainingInput(
        "s3://foo/bar",
        distribution="ShardedByS3Key",
        compression="gzip",
        content_type="whizz",
        record_wrapping="bang",
    )

    channels = _Job._format_inputs_to_input_config(training_input)

    wanted_channel = {
        "CompressionType": "gzip",
        "ChannelName": "training",
        "ContentType": "whizz",
        "DataSource": {
            "S3DataSource": {
                "S3DataType": "S3Prefix",
                "S3DataDistributionType": "ShardedByS3Key",
                "S3Uri": "s3://foo/bar",
            }
        },
        "RecordWrapperType": "bang",
    }
    assert channels == [wanted_channel]
def test_conditional_pytorch_training_model_registration(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
):
    """Create and start a pipeline whose only step is a condition step.

    The condition step runs the train and register steps in its ``if`` branch
    and a create-model step in its ``else`` branch. The pipeline is started
    twice (with default parameters and with ``GoodEnoughInput`` set to 0) and
    the returned ARNs are matched against the expected patterns. Deletion of
    the pipeline afterwards is best effort.
    """
    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    training_script = os.path.join(mnist_dir, "mnist.py")
    uploaded_data = sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    training_input = TrainingInput(s3_data=uploaded_data)

    # Pipeline parameters.
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
    good_enough_input = ParameterInteger(name="GoodEnoughInput", default_value=1)
    in_condition_input = ParameterString(name="Foo", default_value="Foo")

    pytorch_estimator = PyTorch(
        entry_point=training_script,
        role=role,
        framework_version="1.5.0",
        py_version="py3",
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
    )

    # Steps of the "if" branch: train, then register the trained model.
    step_train = TrainingStep(
        name="pytorch-train",
        estimator=pytorch_estimator,
        inputs=training_input,
    )
    step_register = RegisterModel(
        name="pytorch-register-model",
        estimator=pytorch_estimator,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["*"],
        response_types=["*"],
        inference_instances=["*"],
        transform_instances=["*"],
        description="test-description",
    )

    # Step of the "else" branch: create a model from the training artifacts.
    model = Model(
        image_uri=pytorch_estimator.training_image_uri(),
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        sagemaker_session=sagemaker_session,
        role=role,
    )
    step_model = CreateModelStep(
        name="pytorch-model",
        model=model,
        inputs=CreateModelInput(
            instance_type="ml.m5.large",
            accelerator_type="ml.eia1.medium",
        ),
    )

    conditions = [
        ConditionGreaterThanOrEqualTo(left=good_enough_input, right=1),
        ConditionIn(value=in_condition_input, in_values=["foo", "bar"]),
    ]
    step_cond = ConditionStep(
        name="cond-good-enough",
        conditions=conditions,
        if_steps=[step_train, step_register],
        else_steps=[step_model],
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[in_condition_input, good_enough_input, instance_count, instance_type],
        steps=[step_cond],
        sagemaker_session=sagemaker_session,
    )

    pipeline_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}"
    )
    execution_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/"
    )

    try:
        creation = pipeline.create(role)
        assert re.match(pipeline_arn_pattern, creation["PipelineArn"])

        execution = pipeline.start(parameters={})
        assert re.match(execution_arn_pattern, execution.arn)

        execution = pipeline.start(parameters={"GoodEnoughInput": 0})
        assert re.match(execution_arn_pattern, execution.arn)
    finally:
        # Best-effort cleanup: a failing delete must not mask the test result.
        try:
            pipeline.delete()
        except Exception:
            pass
def test_tuning_multi_algos(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
    script_dir,
    athena_dataset_definition,
):
    """Create and start a pipeline made of a processing step and a two-estimator tuning step.

    Both tuner entries use the same PyTorch estimator, objective metric,
    batch-size range and metric definition. The ARNs returned by create and
    start are matched against the expected patterns; pipeline deletion
    afterwards is best effort.
    """
    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    training_script = os.path.join(mnist_dir, "mnist.py")
    uploaded_data = sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    census_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv"

    sklearn_processor = SKLearnProcessor(
        framework_version="0.20.0",
        instance_type=instance_type,
        instance_count=instance_count,
        base_job_name="test-sklearn",
        sagemaker_session=sagemaker_session,
        role=role,
    )

    property_file = PropertyFile(
        name="DataAttributes",
        output_name="attributes",
        path="attributes.json",
    )

    processing_inputs = [
        ProcessingInput(source=census_data, destination="/opt/ml/processing/input"),
        ProcessingInput(dataset_definition=athena_dataset_definition),
    ]
    processing_outputs = [
        ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
        ProcessingOutput(output_name="attributes", source="/opt/ml/processing/attributes.json"),
    ]
    step_process = ProcessingStep(
        name="my-process",
        display_name="ProcessingStep",
        description="description for Processing step",
        processor=sklearn_processor,
        inputs=processing_inputs,
        outputs=processing_outputs,
        property_files=[property_file],
        code=os.path.join(script_dir, "preprocessing.py"),
    )

    # Hyperparameter values: one pipeline parameter and one value read from
    # the property file of the processing step.
    static_hp_1 = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
    json_get_hp = JsonGet(
        step_name=step_process.name,
        property_file=property_file,
        json_path="train_size",
    )

    pytorch_estimator = PyTorch(
        entry_point=training_script,
        role=role,
        framework_version="1.5.0",
        py_version="py3",
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        enable_sagemaker_metrics=True,
        max_retry_attempts=3,
        hyperparameters={"static-hp": static_hp_1, "train_size": json_get_hp},
    )

    min_batch_size = ParameterString(name="MinBatchSize", default_value="64")
    max_batch_size = json_get_hp

    # Both tuner entries are configured identically.
    estimator_names = ("estimator-1", "estimator-2")
    tuner = HyperparameterTuner.create(
        estimator_dict={name: pytorch_estimator for name in estimator_names},
        objective_metric_name_dict={name: "test:acc" for name in estimator_names},
        hyperparameter_ranges_dict={
            name: {"batch-size": IntegerParameter(min_batch_size, max_batch_size)}
            for name in estimator_names
        },
        metric_definitions_dict={
            name: [{"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"}]
            for name in estimator_names
        },
    )

    step_tune = TuningStep(
        name="my-tuning-step",
        tuner=tuner,
        inputs={name: TrainingInput(s3_data=uploaded_data) for name in estimator_names},
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type, min_batch_size, max_batch_size],
        steps=[step_process, step_tune],
        sagemaker_session=sagemaker_session,
    )

    pipeline_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}"
    )
    execution_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/"
    )

    try:
        creation = pipeline.create(role)
        assert re.match(pipeline_arn_pattern, creation["PipelineArn"])

        execution = pipeline.start(parameters={})
        assert re.match(execution_arn_pattern, execution.arn)
    finally:
        # Best-effort cleanup: a failing delete must not mask the test result.
        try:
            pipeline.delete()
        except Exception:
            pass
def s3_test_in(self):
    """Return a TrainingInput with content type ``csv`` for the URI of ``s3_test_loc``."""
    test_uri = s3_test_loc.as_uri()
    return TrainingInput(s3_data=test_uri, content_type='csv')
def test_tuning_single_algo(
    sagemaker_session,
    role,
    cpu_instance_type,
    pipeline_name,
    region_name,
):
    """Run a pipeline with one tuning step and two create-model steps and expect success.

    The two models are built from the top-0 and top-1 model artifacts of the
    tuning step. The execution is retried through ``retries``; an attempt
    passes when exactly three steps are listed and all report ``Succeeded``.
    Pipeline deletion afterwards is best effort.
    """
    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    training_script = os.path.join(mnist_dir, "mnist.py")
    uploaded_data = sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    training_input = TrainingInput(s3_data=uploaded_data)

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    pytorch_estimator = PyTorch(
        entry_point=training_script,
        role=role,
        framework_version="1.5.0",
        py_version="py3",
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        enable_sagemaker_metrics=True,
        max_retry_attempts=3,
    )

    min_batch_size = ParameterInteger(name="MinBatchSize", default_value=64)
    max_batch_size = ParameterInteger(name="MaxBatchSize", default_value=128)

    tuner = HyperparameterTuner(
        estimator=pytorch_estimator,
        objective_metric_name="test:acc",
        objective_type="Maximize",
        hyperparameter_ranges={
            "batch-size": IntegerParameter(min_batch_size, max_batch_size),
        },
        metric_definitions=[
            {"Name": "test:acc", "Regex": "Overall test accuracy: (.*?);"},
        ],
        max_jobs=2,
        max_parallel_jobs=2,
    )

    step_tune = TuningStep(
        name="my-tuning-step",
        tuner=tuner,
        inputs=training_input,
    )

    # Model built from the best training job of the tuning step.
    best_model = Model(
        image_uri=pytorch_estimator.training_image_uri(),
        model_data=step_tune.get_top_model_s3_uri(
            top_k=0,
            s3_bucket=sagemaker_session.default_bucket(),
        ),
        sagemaker_session=sagemaker_session,
        role=role,
    )
    model_inputs = CreateModelInput(
        instance_type="ml.m5.large",
        accelerator_type="ml.eia1.medium",
    )
    step_best_model = CreateModelStep(
        name="1st-model",
        model=best_model,
        inputs=model_inputs,
    )

    # Model built from the second-best training job; this one also carries
    # an entry point and a source directory.
    second_best_model = Model(
        image_uri=pytorch_estimator.training_image_uri(),
        model_data=step_tune.get_top_model_s3_uri(
            top_k=1,
            s3_bucket=sagemaker_session.default_bucket(),
        ),
        sagemaker_session=sagemaker_session,
        role=role,
        entry_point=training_script,
        source_dir=mnist_dir,
    )
    step_second_best_model = CreateModelStep(
        name="2nd-best-model",
        model=second_best_model,
        inputs=model_inputs,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type, min_batch_size, max_batch_size],
        steps=[step_tune, step_best_model, step_second_best_model],
        sagemaker_session=sagemaker_session,
    )

    pipeline_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}"
    )
    execution_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/"
    )

    try:
        creation = pipeline.create(role)
        assert re.match(pipeline_arn_pattern, creation["PipelineArn"])

        for _ in retries(
            max_retry_count=5,
            exception_message_prefix="Waiting for a successful execution of pipeline",
            seconds_to_sleep=10,
        ):
            execution = pipeline.start(parameters={})
            assert re.match(execution_arn_pattern, execution.arn)

            # A waiter timeout is tolerated; the step statuses decide below.
            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass

            listed_steps = execution.list_steps()
            assert len(listed_steps) == 3
            for listed_step in listed_steps:
                assert listed_step["StepStatus"] == "Succeeded"
            break
    finally:
        # Best-effort cleanup: a failing delete must not mask the test result.
        try:
            pipeline.delete()
        except Exception:
            pass
def s3_val_in(self):
    """Return a TrainingInput with content type ``csv`` for the validation CSV under ``s3_data_loc``."""
    # NOTE(review): the second format argument starts with "/", so the URI
    # contains "//" before "val.csv" - confirm this matches the stored key.
    validation_uri = "s3://{}/{}".format(s3_data_loc, "/val.csv")
    return TrainingInput(validation_uri, content_type='csv')
classifier.set_hyperparameters(num_layers=152, use_pretrained_model=0, image_shape='3,224,224', num_classes=2, mini_batch_size=32, epochs=30, learning_rate=0.01, num_training_samples=963, precision_dtype='float32') #Training channels from sagemaker import s3train train_data = TrainingInput(s3train, distribution='FullyReplicated', content_type='application/x-image', s3_data_type='S3Prefix') validation_data = TrainingInput(s3validation, distribution='FullyReplicated', content_type='application/x-image', s3_data_type='S3Prefix') train_data_lst = TrainingInput(s3train_lst, distribution='FullyReplicated', content_type='application/x-image', s3_data_type='S3Prefix') validation_data_lst = TrainingInput(s3validation_lst, distribution='FullyReplicated', content_type='application/x-image', s3_data_type='S3Prefix') data_channels = {
def test_sklearn_xgboost_sip_model_registration(sagemaker_session, role, pipeline_name,
                                                region_name):
    """Upsert and start a processing -> training -> register pipeline for a two-model PipelineModel.

    The registered PipelineModel holds an XGBoost model (from the training
    step artifacts) followed by an SKLearn model (from the ``model`` output
    of the processing step). The pipeline is started twice and the returned
    ARNs are matched against the expected patterns; deletion afterwards is
    best effort.
    """
    prefix = "sip"
    bucket_name = sagemaker_session.default_bucket()

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    sklearn_processor = SKLearnProcessor(
        role=role,
        instance_type=instance_type,
        instance_count=instance_count,
        framework_version="0.20.0",
        sagemaker_session=sagemaker_session,
    )

    # S3 locations, each exposed as a pipeline parameter:
    # raw data, preprocessed train/validation data, training output and the
    # featurizer model.
    raw_data_path_param = ParameterString(
        name="raw_data_path",
        default_value="s3://{0}/{1}/data/raw/".format(bucket_name, prefix),
    )
    train_data_path_param = ParameterString(
        name="train_data_path",
        default_value="s3://{0}/{1}/data/preprocessed/train/".format(bucket_name, prefix),
    )
    val_data_path_param = ParameterString(
        name="val_data_path",
        default_value="s3://{0}/{1}/data/preprocessed/val/".format(bucket_name, prefix),
    )
    output_path_param = ParameterString(
        name="output_path",
        default_value="s3://{0}/{1}/output/".format(bucket_name, prefix),
    )
    model_path_param = ParameterString(
        name="model_path",
        default_value="s3://{0}/{1}/output/sklearn/".format(bucket_name, prefix),
    )

    sip_dir = os.path.join(DATA_DIR, "sip")
    code_location = "s3://{0}/{1}/code".format(bucket_name, prefix)

    processing_step = ProcessingStep(
        name="Processing",
        code=os.path.join(sip_dir, "preprocessor.py"),
        processor=sklearn_processor,
        inputs=[
            ProcessingInput(
                input_name="raw_data",
                source=raw_data_path_param,
                destination="/opt/ml/processing/input",
            )
        ],
        outputs=[
            ProcessingOutput(
                output_name="train_data",
                source="/opt/ml/processing/train",
                destination=train_data_path_param,
            ),
            ProcessingOutput(
                output_name="val_data",
                source="/opt/ml/processing/val",
                destination=val_data_path_param,
            ),
            ProcessingOutput(
                output_name="model",
                source="/opt/ml/processing/model",
                destination=model_path_param,
            ),
        ],
        job_arguments=["--train-test-split-ratio", "0.2"],
    )

    def _processing_output_uri(output_name):
        # S3 URI property of one named output of the processing step.
        processing_outputs = processing_step.properties.ProcessingOutputConfig.Outputs
        return processing_outputs[output_name].S3Output.S3Uri

    estimator = XGBoost(
        entry_point="training.py",
        source_dir=sip_dir,
        output_path=output_path_param,
        code_location=code_location,
        instance_type=instance_type,
        instance_count=instance_count,
        framework_version="0.90-2",
        sagemaker_session=sagemaker_session,
        py_version="py3",
        role=role,
    )

    training_step = TrainingStep(
        name="Training",
        estimator=estimator,
        inputs={
            "train": TrainingInput(
                s3_data=_processing_output_uri("train_data"),
                content_type="text/csv",
            ),
            "validation": TrainingInput(
                s3_data=_processing_output_uri("val_data"),
                content_type="text/csv",
            ),
        },
    )

    sklearn_model = SKLearnModel(
        name="sklearn-model",
        model_data=_processing_output_uri("model"),
        entry_point="inference.py",
        source_dir=os.path.join(sip_dir, "sklearn_source_dir"),
        code_location=code_location,
        role=role,
        sagemaker_session=sagemaker_session,
        framework_version="0.20.0",
        py_version="py3",
    )

    xgboost_model = XGBoostModel(
        name="xgboost-model",
        model_data=training_step.properties.ModelArtifacts.S3ModelArtifacts,
        entry_point="inference.py",
        source_dir=os.path.join(sip_dir, "xgboost_source_dir"),
        code_location=code_location,
        framework_version="0.90-2",
        py_version="py3",
        role=role,
        sagemaker_session=sagemaker_session,
    )

    pipeline_model = PipelineModel(
        [xgboost_model, sklearn_model],
        role,
        sagemaker_session=sagemaker_session,
    )

    step_register = RegisterModel(
        name="AbaloneRegisterModel",
        model=pipeline_model,
        content_types=["application/json"],
        response_types=["application/json"],
        inference_instances=["ml.t2.medium", "ml.m5.xlarge"],
        transform_instances=["ml.m5.xlarge"],
        model_package_group_name="windturbine",
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[
            raw_data_path_param,
            train_data_path_param,
            val_data_path_param,
            model_path_param,
            instance_type,
            instance_count,
            output_path_param,
        ],
        steps=[processing_step, training_step, step_register],
        sagemaker_session=sagemaker_session,
    )

    pipeline_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}"
    )
    execution_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/"
    )

    try:
        upserted = pipeline.upsert(role_arn=role)
        assert re.match(pipeline_arn_pattern, upserted["PipelineArn"])

        # Start once with an explicit empty parameter dict, once without.
        execution = pipeline.start(parameters={})
        assert re.match(execution_arn_pattern, execution.arn)

        execution = pipeline.start()
        assert re.match(execution_arn_pattern, execution.arn)
    finally:
        # Best-effort cleanup: a failing delete must not mask the test result.
        try:
            pipeline.delete()
        except Exception:
            pass
def test_training_job_with_debugger_and_profiler(
    sagemaker_session,
    pipeline_name,
    role,
    pytorch_training_latest_version,
    pytorch_training_latest_py_version,
):
    """Run a one-step training pipeline with debugger rules and check the resulting job.

    Each attempt of the ``retries`` loop creates the pipeline, starts it and
    waits for the execution. An attempt whose step reports a failure reason
    is logged and retried. On success the described training job is checked
    against the configured rules, the debugger hook config and the profiler
    settings. The pipeline is deleted (best effort) at the end of every
    attempt.
    """
    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    rules = [
        Rule.sagemaker(rule_configs.vanishing_gradient()),
        Rule.sagemaker(
            base_config=rule_configs.all_zero(),
            rule_parameters={"tensor_regex": ".*"},
        ),
        Rule.sagemaker(rule_configs.loss_not_decreasing()),
    ]
    tensors_output_path = f"s3://{sagemaker_session.default_bucket()}/{uuid.uuid4()}/tensors"
    debugger_hook_config = DebuggerHookConfig(s3_output_path=tensors_output_path)

    mnist_dir = os.path.join(DATA_DIR, "pytorch_mnist")
    training_script = os.path.join(mnist_dir, "mnist.py")
    uploaded_data = sagemaker_session.upload_data(
        path=os.path.join(mnist_dir, "training"),
        key_prefix="integ-test-data/pytorch_mnist/training",
    )
    training_input = TrainingInput(s3_data=uploaded_data)

    pytorch_estimator = PyTorch(
        entry_point=training_script,
        role="SageMakerRole",
        framework_version=pytorch_training_latest_version,
        py_version=pytorch_training_latest_py_version,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        rules=rules,
        debugger_hook_config=debugger_hook_config,
    )

    step_train = TrainingStep(
        name="pytorch-train",
        estimator=pytorch_estimator,
        inputs=training_input,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[step_train],
        sagemaker_session=sagemaker_session,
    )

    for _ in retries(
        max_retry_count=5,
        exception_message_prefix="Waiting for a successful execution of pipeline",
        seconds_to_sleep=10,
    ):
        try:
            creation = pipeline.create(role)
            create_arn = creation["PipelineArn"]

            execution = pipeline.start()
            description = execution.describe()
            assert description["PipelineArn"] == create_arn

            # A waiter timeout is tolerated; the step status decides below.
            try:
                execution.wait(delay=10, max_attempts=60)
            except WaiterError:
                pass

            listed_steps = execution.list_steps()
            assert len(listed_steps) == 1
            train_step = listed_steps[0]

            failure_reason = train_step.get("FailureReason", "")
            if failure_reason != "":
                logging.error(f"Pipeline execution failed with error: {failure_reason}.Retrying..")
                continue

            assert train_step["StepName"] == "pytorch-train"
            assert train_step["StepStatus"] == "Succeeded"

            # The job name is the part of the ARN after the first "/".
            training_job_arn = train_step["Metadata"]["TrainingJob"]["Arn"]
            job_description = sagemaker_session.sagemaker_client.describe_training_job(
                TrainingJobName=training_job_arn.split("/")[1]
            )

            # Rule configurations are compared position by position.
            for position, rule in enumerate(rules):
                rule_config = job_description["DebugRuleConfigurations"][position]
                assert rule_config["RuleConfigurationName"] == rule.name
                assert rule_config["RuleEvaluatorImage"] == rule.image_uri
                assert rule_config["VolumeSizeInGB"] == 0
                wanted_rule = rule.rule_parameters["rule_to_invoke"]
                assert rule_config["RuleParameters"]["rule_to_invoke"] == wanted_rule

            assert job_description["DebugHookConfig"] == debugger_hook_config._to_request_dict()
            assert job_description["ProfilingStatus"] == "Enabled"
            assert job_description["ProfilerConfig"]["ProfilingIntervalInMilliseconds"] == 500
            break
        finally:
            # Best-effort cleanup after every attempt, so a retry can create
            # the pipeline again.
            try:
                pipeline.delete()
            except Exception:
                pass
def test_model_registration_with_tensorflow_model_with_pipeline_model(
        sagemaker_session, role, tf_full_version, tf_full_py_version, pipeline_name,
        region_name):
    """Run a train + register pipeline for a PipelineModel wrapping one TensorFlow model.

    The execution is retried through ``retries``; an attempt passes when
    exactly three steps are listed and all report ``Succeeded``. Pipeline
    deletion afterwards is best effort.
    """
    tf_mnist_dir = os.path.join(DATA_DIR, "tensorflow_mnist")
    training_script = os.path.join(tf_mnist_dir, "mnist_v2.py")
    uploaded_data = sagemaker_session.upload_data(
        path=os.path.join(tf_mnist_dir, "data"),
        key_prefix="integ-test-data/tf-scriptmode/mnist/training",
    )
    training_input = TrainingInput(s3_data=uploaded_data)

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    tensorflow_estimator = TensorFlow(
        entry_point=training_script,
        role=role,
        instance_count=instance_count,
        instance_type=instance_type,
        framework_version=tf_full_version,
        py_version=tf_full_py_version,
        sagemaker_session=sagemaker_session,
    )
    step_train = TrainingStep(
        name="MyTrain",
        estimator=tensorflow_estimator,
        inputs=training_input,
    )

    model = TensorFlowModel(
        entry_point=training_script,
        framework_version="2.4",
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        role=role,
        sagemaker_session=sagemaker_session,
    )
    pipeline_model = PipelineModel(
        name="MyModelPipeline",
        models=[model],
        role=role,
        sagemaker_session=sagemaker_session,
    )

    step_register_model = RegisterModel(
        name="MyRegisterModel",
        model=pipeline_model,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["application/json"],
        response_types=["application/json"],
        inference_instances=["ml.t2.medium", "ml.m5.large"],
        transform_instances=["ml.m5.large"],
        model_package_group_name=f"{pipeline_name}TestModelPackageGroup",
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[step_train, step_register_model],
        sagemaker_session=sagemaker_session,
    )

    pipeline_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}"
    )
    execution_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/"
    )

    try:
        creation = pipeline.create(role)
        assert re.match(pipeline_arn_pattern, creation["PipelineArn"])

        for _ in retries(
            max_retry_count=5,
            exception_message_prefix="Waiting for a successful execution of pipeline",
            seconds_to_sleep=10,
        ):
            execution = pipeline.start(parameters={})
            assert re.match(execution_arn_pattern, execution.arn)

            # A waiter timeout is tolerated; the step statuses decide below.
            try:
                execution.wait(delay=30, max_attempts=60)
            except WaiterError:
                pass

            listed_steps = execution.list_steps()
            assert len(listed_steps) == 3
            for listed_step in listed_steps:
                assert listed_step["StepStatus"] == "Succeeded"
            break
    finally:
        # Best-effort cleanup: a failing delete must not mask the test result.
        try:
            pipeline.delete()
        except Exception:
            pass
def test_training_step_with_output_path_as_join(
    sagemaker_session, role, tf_full_version, tf_full_py_version, pipeline_name, region_name
):
    """Run a one-step training pipeline whose estimator output path is a ``Join`` expression.

    After the execution has been waited for (a waiter timeout is tolerated),
    exactly one step named ``MyTrain`` must be listed. Pipeline deletion
    afterwards is best effort.
    """
    data_dir = os.path.join(DATA_DIR, "dummy_tensor")
    uploaded_data = sagemaker_session.upload_data(
        path=data_dir, key_prefix="integ-test-data/estimator/training"
    )
    training_input = TrainingInput(s3_data=uploaded_data)

    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
    instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")

    # Output location assembled from its parts by the Join function.
    output_path_parts = [
        "s3:/",
        f"{sagemaker_session.default_bucket()}",
        f"{pipeline_name}Train",
    ]
    output_path = Join(on="/", values=output_path_parts)

    image_uri = image_uris.retrieve("factorization-machines", sagemaker_session.boto_region_name)
    estimator = Estimator(
        image_uri=image_uri,
        role=role,
        instance_count=instance_count,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        output_path=output_path,
    )
    estimator.set_hyperparameters(
        num_factors=10,
        feature_dim=784,
        mini_batch_size=100,
        predictor_type="binary_classifier",
    )

    step_train = TrainingStep(
        name="MyTrain",
        estimator=estimator,
        inputs=training_input,
    )

    pipeline = Pipeline(
        name=pipeline_name,
        parameters=[instance_count, instance_type],
        steps=[step_train],
        sagemaker_session=sagemaker_session,
    )

    pipeline_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}"
    )
    execution_arn_pattern = (
        rf"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/"
    )

    try:
        creation = pipeline.create(role)
        assert re.match(pipeline_arn_pattern, creation["PipelineArn"])

        execution = pipeline.start(parameters={})
        assert re.match(execution_arn_pattern, execution.arn)

        try:
            execution.wait(delay=30, max_attempts=60)
        except WaiterError:
            pass

        listed_steps = execution.list_steps()
        assert len(listed_steps) == 1
        assert listed_steps[0]["StepName"] == "MyTrain"
    finally:
        # Best-effort cleanup: a failing delete must not mask the test result.
        try:
            pipeline.delete()
        except Exception:
            pass