import os

from sagemaker import IPInsights, IPInsightsModel
from sagemaker.predictor import Predictor
from sagemaker.serverless import ServerlessInferenceConfig
from sagemaker.utils import unique_name_from_base

# Shared constants and helpers from the SageMaker Python SDK's integration-test
# package; sagemaker_session and cpu_instance_type are pytest fixtures.
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
from tests.integ.record_set import prepare_record_set_from_local_files
from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name

# IP Insights trains on unlabeled (entity, IP address) pairs, so the record
# set carries no feature dimension.
FEATURE_DIM = None
ROLE = "SageMakerRole"
SINGLE_INSTANCE_COUNT = 1
AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS = 10  # only generates config; no training job runs


def test_ipinsights(sagemaker_session):
    job_name = unique_name_from_base("ipinsights")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "ipinsights")
        data_filename = "train.csv"

        # The record count is needed to build the RecordSet below.
        with open(os.path.join(data_path, data_filename), "rb") as f:
            num_records = len(f.readlines())

        ipinsights = IPInsights(
            role="SageMakerRole",
            instance_count=1,  # SDK v2 name for train_instance_count
            instance_type="ml.c4.xlarge",  # SDK v2 name for train_instance_type
            num_entity_vectors=10,
            vector_dim=100,
            sagemaker_session=sagemaker_session,
        )

        record_set = prepare_record_set_from_local_files(
            data_path, ipinsights.data_location, num_records, FEATURE_DIM, sagemaker_session
        )
        ipinsights.fit(records=record_set, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = IPInsightsModel(
            ipinsights.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(1, "ml.c4.xlarge", endpoint_name=job_name)
        # RealTimePredictor was renamed Predictor in SDK v2.
        assert isinstance(predictor, Predictor)

        predict_input = [["user_1", "1.1.1.1"]]
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label["dot_product"] is not None
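

# prepare_record_set_from_local_files lives in the SDK's tests/integ/record_set.py.
# If these tests are adapted outside that repository, a minimal sketch of the
# helper could look like the following; the key-prefix layout is an assumption,
# not the canonical implementation.


def _prepare_record_set_sketch(dir_path, destination, num_records, feature_dim, session):
    """Upload a local data directory to S3 and wrap it as a RecordSet (sketch)."""
    from urllib.parse import urlparse

    from sagemaker.amazon.amazon_estimator import RecordSet

    # destination is the estimator's data_location, e.g. s3://bucket/prefix
    parsed = urlparse(destination)
    uri = session.upload_data(
        path=dir_path,
        bucket=parsed.netloc,
        key_prefix="{}/{}".format(parsed.path.strip("/"), os.path.basename(dir_path)),
    )
    return RecordSet(uri, num_records=num_records, feature_dim=feature_dim, s3_data_type="S3Prefix")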
def test_ipinsights_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        data_path = os.path.join(DATA_DIR, "ipinsights")
        data_filename = "train.csv"

        with open(os.path.join(data_path, data_filename), "rb") as f:
            num_records = len(f.readlines())

        ipinsights = IPInsights(
            role=ROLE,
            instance_count=SINGLE_INSTANCE_COUNT,
            instance_type=cpu_instance_type,
            num_entity_vectors=10,
            vector_dim=100,
            sagemaker_session=sagemaker_session,
        )

        records = prepare_record_set_from_local_files(
            data_path, ipinsights.data_location, num_records, None, sagemaker_session
        )

        training_config = _build_airflow_workflow(
            estimator=ipinsights, instance_type=cpu_instance_type, inputs=records
        )

        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"],
        )
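

# The two private helpers above live in the SDK's tests/integ/test_airflow_config.py
# and are reconstructed here as minimal sketches so this module stands alone.
# The real _build_airflow_workflow also wires the config into an Airflow DAG
# with a SageMakerTrainingOperator; only the part the assertion depends on is
# kept, and the details are assumptions.


def _build_airflow_workflow(estimator, instance_type, inputs=None, mini_batch_size=None):
    """Generate the training config dict that Airflow operators consume (sketch)."""
    import sagemaker.workflow.airflow as sm_airflow

    # training_config() serializes the estimator and its inputs into the
    # CreateTrainingJob request dict whose S3 URI the test then checks.
    # instance_type is unused here; the estimator already carries it.
    return sm_airflow.training_config(
        estimator=estimator, inputs=inputs, mini_batch_size=mini_batch_size
    )


def _assert_that_s3_url_contains_data(sagemaker_session, s3_url):
    """Assert that at least one object exists under the given S3 prefix (sketch)."""
    from urllib.parse import urlparse

    parsed = urlparse(s3_url)
    response = sagemaker_session.boto_session.client("s3").list_objects_v2(
        Bucket=parsed.netloc, Prefix=parsed.path.lstrip("/")
    )
    assert response["KeyCount"] > 0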
def test_ipinsights_serverless_inference(sagemaker_session, cpu_instance_type):
    job_name = unique_name_from_base("ipinsights-serverless")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "ipinsights")
        data_filename = "train.csv"

        with open(os.path.join(data_path, data_filename), "rb") as f:
            num_records = len(f.readlines())

        ipinsights = IPInsights(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            num_entity_vectors=10,
            vector_dim=100,
            sagemaker_session=sagemaker_session,
        )

        record_set = prepare_record_set_from_local_files(
            data_path, ipinsights.data_location, num_records, FEATURE_DIM, sagemaker_session
        )
        ipinsights.fit(records=record_set, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = IPInsightsModel(
            ipinsights.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(
            serverless_inference_config=ServerlessInferenceConfig(memory_size_in_mb=6144),
            endpoint_name=job_name,
        )
        assert isinstance(predictor, Predictor)

        predict_input = [["user_1", "1.1.1.1"]]
        result = predictor.predict(predict_input)

        assert len(result["predictions"]) == 1
        assert 0 > result["predictions"][0]["dot_product"] > -1  # We expect ~ -0.22
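

# For completeness: the serverless endpoint above can also be invoked without
# the SDK's Predictor wrapper. A minimal sketch using plain boto3, assuming the
# endpoint named after job_name still exists when this runs.


def _invoke_ipinsights_endpoint_sketch(endpoint_name):
    """Send one CSV (entity, IP) pair to the endpoint via boto3 (sketch)."""
    import json

    import boto3

    runtime = boto3.client("sagemaker-runtime")
    response = runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="text/csv",  # IP Insights accepts CSV rows of "entity,ip_address"
        Accept="application/json",
        Body="user_1,1.1.1.1",
    )
    # Response body mirrors the SDK result: {"predictions": [{"dot_product": ...}]}
    return json.loads(response["Body"].read())["predictions"]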