def test_tuning_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Tune KMeans on "FSxLustre" record sets and expect a best training job.

    A KMeans estimator is built with the subnet, security groups and role taken
    from the ``efs_fsx_setup`` fixture, wrapped in a ``HyperparameterTuner`` and
    fitted on a train and a test ``FileSystemRecordSet`` that point at the same
    file system directory. The test passes when, after waiting for the tuner,
    ``best_training_job()`` returns a truthy value.
    """
    # NOTE(review): this test reads the fixture through attributes and passes
    # train_instance_count/train_instance_type, whereas test_kmeans_fsx reads
    # it through item access and passes instance_count/instance_type --
    # confirm which fixture shape / estimator signature applies here.
    vpc_subnets = [efs_fsx_setup.subnet_id]
    sg_ids = efs_fsx_setup.security_group_ids
    execution_role = efs_fsx_setup.role_name

    estimator = KMeans(
        role=execution_role,
        train_instance_count=TRAIN_INSTANCE_COUNT,
        train_instance_type=cpu_instance_type,
        k=K,
        sagemaker_session=sagemaker_session,
        subnets=vpc_subnets,
        security_group_ids=sg_ids,
    )

    search_space = {
        "extra_center_factor": IntegerParameter(4, 10),
        "mini_batch_size": IntegerParameter(10, 100),
        "epochs": IntegerParameter(1, 2),
        "init_method": CategoricalParameter(["kmeans++", "random"]),
    }

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tuner = HyperparameterTuner(
            estimator=estimator,
            objective_metric_name=OBJECTIVE_METRIC_NAME,
            hyperparameter_ranges=search_space,
            objective_type="Minimize",
            max_jobs=MAX_JOBS,
            max_parallel_jobs=MAX_PARALLEL_JOBS,
        )

        # Both channels read from the same file system location; only the
        # channel name differs between them.
        fsx_source = {
            "file_system_id": efs_fsx_setup.file_system_fsx_id,
            "file_system_type": "FSxLustre",
            "directory_path": FSX_DIR_PATH,
            "num_records": NUM_RECORDS,
            "feature_dim": FEATURE_DIM,
        }
        train_channel = FileSystemRecordSet(**fsx_source)
        test_channel = FileSystemRecordSet(channel="test", **fsx_source)

        tuner.fit(
            [train_channel, test_channel],
            job_name=unique_name_from_base("tune-kmeans-fsx"),
        )
        tuner.wait()

        assert tuner.best_training_job()
def test_file_system_record_set_fsx_customized_parameters():
    """Check a ``FileSystemRecordSet`` built with explicit access mode and channel.

    The record set is created for an "FSxLustre" file system with
    ``file_system_access_mode="rw"`` and ``channel="test"``; the resulting
    input config, record count, feature dimension and channel must reflect
    exactly those values.
    """
    fs_id = "fs-0a48d2a1"
    fs_type = "FSxLustre"
    path = "ipinsights"
    record_count = 1
    dim = 1

    record_set = FileSystemRecordSet(
        file_system_id=fs_id,
        file_system_type=fs_type,
        directory_path=path,
        num_records=record_count,
        feature_dim=dim,
        file_system_access_mode="rw",
        channel="test",
    )

    file_system_source = {
        "DirectoryPath": "ipinsights",
        "FileSystemId": "fs-0a48d2a1",
        "FileSystemType": "FSxLustre",
        "FileSystemAccessMode": "rw",
    }
    assert record_set.file_system_input.config == {
        "DataSource": {"FileSystemDataSource": file_system_source}
    }
    assert record_set.num_records == record_count
    assert record_set.feature_dim == dim
    assert record_set.channel == "test"
def test_file_system_record_set_efs_default_parameters():
    """Check a ``FileSystemRecordSet`` built without access mode or channel.

    Only the required arguments are supplied for an "EFS" file system; the
    test expects the input config to carry access mode "ro" and the record
    set's channel to be "train".
    """
    fs_id = "fs-0a48d2a1"
    fs_type = "EFS"
    path = "ipinsights"
    record_count = 1
    dim = 1

    record_set = FileSystemRecordSet(
        file_system_id=fs_id,
        file_system_type=fs_type,
        directory_path=path,
        num_records=record_count,
        feature_dim=dim,
    )

    file_system_source = {
        "DirectoryPath": "ipinsights",
        "FileSystemId": "fs-0a48d2a1",
        "FileSystemType": "EFS",
        "FileSystemAccessMode": "ro",
    }
    assert record_set.file_system_input.config == {
        "DataSource": {"FileSystemDataSource": file_system_source}
    }
    assert record_set.num_records == record_count
    assert record_set.feature_dim == dim
    assert record_set.channel == "train"
def test_kmeans_fsx(efs_fsx_setup, sagemaker_session, cpu_instance_type):
    """Fit KMeans on an "FSxLustre" record set and check the model artifact.

    The estimator is configured with the role, subnet and security groups
    looked up in the ``efs_fsx_setup`` fixture, fitted on a single
    ``FileSystemRecordSet``, and the directory part of ``model_data`` is then
    checked for a ``model.tar.gz`` file. The whole flow runs under the
    training timeout.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        execution_role = efs_fsx_setup["role_name"]
        vpc_subnets = [efs_fsx_setup["subnet_id"]]
        sg_ids = efs_fsx_setup["security_group_ids"]

        estimator = KMeans(
            role=execution_role,
            instance_count=INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            k=K,
            sagemaker_session=sagemaker_session,
            subnets=vpc_subnets,
            security_group_ids=sg_ids,
        )

        fsx_records = FileSystemRecordSet(
            file_system_id=efs_fsx_setup["file_system_fsx_id"],
            file_system_type="FSxLustre",
            directory_path=FSX_DIR_PATH,
            num_records=NUM_RECORDS,
            feature_dim=FEATURE_DIM,
        )

        estimator.fit(fsx_records, job_name=unique_name_from_base("kmeans-fsx"))

        # Drop the final path component of model_data to get its directory.
        model_dir, _artifact_name = estimator.model_data.rsplit("/", 1)
        assert_s3_files_exist(sagemaker_session, model_dir, ["model.tar.gz"])
def test_file_system_record_set_data_channel():
    """Check that ``data_channel()`` maps the channel name to the file system input.

    The record set's ``file_system_input`` is replaced with a ``Mock`` so the
    assertion can verify that this exact object is returned under "train".
    """
    record_set = FileSystemRecordSet(
        file_system_id="fs-0a48d2a1",
        file_system_type="EFS",
        directory_path="ipinsights",
        num_records=1,
        feature_dim=1,
    )

    stub_input = Mock()
    record_set.file_system_input = stub_input

    assert record_set.data_channel() == {"train": stub_input}
def test_format_record_set_list_input():
    """Check that a list of record sets is formatted into per-channel inputs.

    Two ``FileSystemRecordSet`` objects are passed, one without an explicit
    channel and one with ``channel="validation"``; the formatted result must
    hold a ``FileSystemInput`` under both "train" and "validation".
    """
    efs_source = {
        "file_system_id": "fs-fd85e556",
        "file_system_type": "EFS",
        "directory_path": "ipinsights",
        "feature_dim": 1,
    }
    train_set = FileSystemRecordSet(num_records=100, **efs_source)
    validation_set = FileSystemRecordSet(
        num_records=20, channel="validation", **efs_source
    )

    formatted = _Job._format_record_set_list_input([train_set, validation_set])

    for channel_name in ("train", "validation"):
        assert isinstance(formatted[channel_name], FileSystemInput)
def test_format_inputs_to_input_config_file_system_record_set():
    """Check the input config produced from a single ``FileSystemRecordSet``.

    The first channel's ``FileSystemDataSource`` entry must echo the directory
    path, file system id and type given to the record set, with access mode
    "ro".
    """
    fs_id = "fs-0a48d2a1"
    fs_type = "EFS"
    path = "ipinsights"

    record_set = FileSystemRecordSet(
        file_system_id=fs_id,
        file_system_type=fs_type,
        directory_path=path,
        num_records=1,
        feature_dim=1,
    )

    channel_configs = _Job._format_inputs_to_input_config(record_set)
    file_system_source = channel_configs[0]["DataSource"]["FileSystemDataSource"]

    assert file_system_source["DirectoryPath"] == path
    assert file_system_source["FileSystemId"] == fs_id
    assert file_system_source["FileSystemType"] == fs_type
    assert file_system_source["FileSystemAccessMode"] == "ro"