def test_tuning_lda(sagemaker_session):
    """Run a small LDA hyperparameter tuning job and validate the best model's predictions."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        lda_dir = os.path.join(DATA_DIR, 'lda')
        with open(os.path.join(lda_dir, 'nips-train_1.pbr'), 'rb') as pbr_file:
            records = read_records(pbr_file)

        # Every record shares the same feature dimension; read it off the first one.
        dim = int(records[0].features['values'].float32_tensor.shape[0])

        lda = LDA(role='SageMakerRole', train_instance_type='ml.c4.xlarge', num_topics=10,
                  sagemaker_session=sagemaker_session, base_job_name='test-lda')

        train_set = prepare_record_set_from_local_files(
            lda_dir, lda.data_location, len(records), dim, sagemaker_session)
        validation_set = prepare_record_set_from_local_files(
            lda_dir, lda.data_location, len(records), dim, sagemaker_session)
        validation_set.channel = 'test'

        # Hyperparameters the tuner searches over.
        search_space = {'alpha0': ContinuousParameter(1, 10),
                        'num_topics': IntegerParameter(1, 2)}

        tuner = HyperparameterTuner(estimator=lda, objective_metric_name='test:pwll',
                                    hyperparameter_ranges=search_space,
                                    objective_type='Maximize', max_jobs=2, max_parallel_jobs=2)
        tuner.fit([train_set, validation_set], mini_batch_size=1)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(np.random.rand(1, dim))

        assert len(result) == 1
        for record in result:
            assert record.label['topic_mixture'] is not None
def test_ntm(sagemaker_session):
    """Train an NTM model on the NIPS record set, deploy it, and sanity-check predictions."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        ntm_dir = os.path.join(DATA_DIR, 'ntm')
        with open(os.path.join(ntm_dir, 'nips-train_1.pbr'), 'rb') as pbr_file:
            records = read_records(pbr_file)

        # Every record shares the same feature dimension.
        dim = int(records[0].features['values'].float32_tensor.shape[0])

        estimator = NTM(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge', num_topics=10,
                        sagemaker_session=sagemaker_session, base_job_name='test-ntm')

        record_set = prepare_record_set_from_local_files(
            ntm_dir, estimator.data_location, len(records), dim, sagemaker_session)
        estimator.fit(record_set, None)

    endpoint_name = unique_name_from_base('ntm')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = NTMModel(estimator.model_data, role='SageMakerRole',
                         sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        result = predictor.predict(np.random.rand(1, dim))
        assert len(result) == 1
        for record in result:
            assert record.label["topic_weights"] is not None
def test_ntm_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
    """Build an Airflow training config for NTM and verify the record data landed in S3."""
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        ntm_dir = os.path.join(DATA_DIR, "ntm")
        with open(os.path.join(ntm_dir, "nips-train_1.pbr"), "rb") as pbr_file:
            records = read_records(pbr_file)

        # Every record shares the same feature dimension.
        dim = int(records[0].features["values"].float32_tensor.shape[0])

        estimator = NTM(
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            num_topics=10,
            sagemaker_session=sagemaker_session,
        )
        record_set = prepare_record_set_from_local_files(
            ntm_dir, estimator.data_location, len(records), dim, sagemaker_session
        )

        training_config = _build_airflow_workflow(
            estimator=estimator, instance_type=cpu_instance_type, inputs=record_set
        )
        # The generated config must point at an S3 URI that actually contains the data.
        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"],
        )
def test_ipinsights_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
    """Build an Airflow training config for IPInsights and verify the data landed in S3."""
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        ip_dir = os.path.join(DATA_DIR, "ipinsights")
        with open(os.path.join(ip_dir, "train.csv"), "rb") as csv_file:
            record_count = len(csv_file.readlines())

        estimator = IPInsights(
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            num_entity_vectors=10,
            vector_dim=100,
            sagemaker_session=sagemaker_session,
        )
        record_set = prepare_record_set_from_local_files(
            ip_dir, estimator.data_location, record_count, None, sagemaker_session
        )

        training_config = _build_airflow_workflow(
            estimator=estimator, instance_type=cpu_instance_type, inputs=record_set
        )
        # The generated config must point at an S3 URI that actually contains the data.
        _assert_that_s3_url_contains_data(
            sagemaker_session,
            training_config["InputDataConfig"][0]["DataSource"]["S3DataSource"]["S3Uri"],
        )
def test_lda(sagemaker_session):
    """Train an LDA model on the NIPS record set, deploy it, and sanity-check predictions."""
    with timeout(minutes=15):
        lda_dir = os.path.join(DATA_DIR, 'lda')
        with open(os.path.join(lda_dir, 'nips-train_1.pbr'), 'rb') as pbr_file:
            records = read_records(pbr_file)

        # Every record shares the same feature dimension.
        dim = int(records[0].features['values'].float32_tensor.shape[0])

        estimator = LDA(role='SageMakerRole', train_instance_type='ml.c4.xlarge', num_topics=10,
                        sagemaker_session=sagemaker_session, base_job_name='test-lda')

        record_set = prepare_record_set_from_local_files(
            lda_dir, estimator.data_location, len(records), dim, sagemaker_session)
        estimator.fit(record_set, 100)

    endpoint_name = name_from_base('lda')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = LDAModel(estimator.model_data, role='SageMakerRole',
                         sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        result = predictor.predict(np.random.rand(1, dim))
        assert len(result) == 1
        for record in result:
            assert record.label["topic_mixture"] is not None
def test_lda(sagemaker_session, cpu_instance_type):
    """Train an LDA model, deploy it under the training job's name, and check predictions."""
    job_name = unique_name_from_base("lda")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        lda_dir = os.path.join(DATA_DIR, "lda")
        with open(os.path.join(lda_dir, "nips-train_1.pbr"), "rb") as pbr_file:
            records = read_records(pbr_file)

        # Every record shares the same feature dimension.
        dim = int(records[0].features["values"].float32_tensor.shape[0])

        lda = LDA(
            role="SageMakerRole",
            instance_type=cpu_instance_type,
            num_topics=10,
            sagemaker_session=sagemaker_session,
        )
        record_set = prepare_record_set_from_local_files(
            lda_dir, lda.data_location, len(records), dim, sagemaker_session
        )
        lda.fit(records=record_set, mini_batch_size=100, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = LDAModel(lda.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)

        result = predictor.predict(np.random.rand(1, dim))
        assert len(result) == 1
        for record in result:
            assert record.label["topic_mixture"] is not None
def test_ntm(sagemaker_session):
    """Train an NTM model on the NIPS record set, deploy it, and sanity-check predictions."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        ntm_dir = os.path.join(DATA_DIR, 'ntm')
        with open(os.path.join(ntm_dir, 'nips-train_1.pbr'), 'rb') as pbr_file:
            records = read_records(pbr_file)

        # Every record shares the same feature dimension.
        dim = int(records[0].features['values'].float32_tensor.shape[0])

        estimator = NTM(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge', num_topics=10,
                        sagemaker_session=sagemaker_session, base_job_name='test-ntm')

        record_set = prepare_record_set_from_local_files(
            ntm_dir, estimator.data_location, len(records), dim, sagemaker_session)
        estimator.fit(record_set, None)

    endpoint_name = name_from_base('ntm')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = NTMModel(estimator.model_data, role='SageMakerRole',
                         sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        result = predictor.predict(np.random.rand(1, dim))
        assert len(result) == 1
        for record in result:
            assert record.label["topic_weights"] is not None
def test_ipinsights(sagemaker_session):
    """Train an IPInsights model, deploy it, and score a single (user, IP) pair."""
    job_name = unique_name_from_base('ipinsights')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        ip_dir = os.path.join(DATA_DIR, 'ipinsights')
        # Training CSV is line-oriented; the line count is the record count.
        with open(os.path.join(ip_dir, 'train.csv'), 'rb') as csv_file:
            record_count = len(csv_file.readlines())

        estimator = IPInsights(role='SageMakerRole', train_instance_count=1,
                               train_instance_type='ml.c4.xlarge', num_entity_vectors=10,
                               vector_dim=100, sagemaker_session=sagemaker_session)

        record_set = prepare_record_set_from_local_files(
            ip_dir, estimator.data_location, record_count, FEATURE_DIM, sagemaker_session)
        estimator.fit(records=record_set, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = IPInsightsModel(estimator.model_data, role='SageMakerRole',
                                sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=job_name)
        assert isinstance(predictor, RealTimePredictor)

        result = predictor.predict([['user_1', '1.1.1.1']])
        assert len(result) == 1
        for record in result:
            assert record.label["dot_product"] is not None
def test_ntm_serverless_inference(sagemaker_session, cpu_instance_type):
    """Train an NTM model and deploy it to a serverless inference endpoint."""
    job_name = unique_name_from_base("ntm-serverless")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        ntm_dir = os.path.join(DATA_DIR, "ntm")
        with open(os.path.join(ntm_dir, "nips-train_1.pbr"), "rb") as pbr_file:
            records = read_records(pbr_file)

        # Every record shares the same feature dimension.
        dim = int(records[0].features["values"].float32_tensor.shape[0])

        estimator = NTM(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            num_topics=10,
            sagemaker_session=sagemaker_session,
        )
        record_set = prepare_record_set_from_local_files(
            ntm_dir, estimator.data_location, len(records), dim, sagemaker_session
        )
        estimator.fit(records=record_set, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = NTMModel(estimator.model_data, role="SageMakerRole",
                         sagemaker_session=sagemaker_session)
        # Default ServerlessInferenceConfig: no instance count/type needed.
        predictor = model.deploy(
            serverless_inference_config=ServerlessInferenceConfig(), endpoint_name=job_name
        )
        assert isinstance(predictor, Predictor)
def test_tuning_lda(sagemaker_session):
    """Tune LDA hyperparameters on the NIPS record set and validate the best model's output."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        lda_dir = os.path.join(DATA_DIR, 'lda')
        with open(os.path.join(lda_dir, 'nips-train_1.pbr'), 'rb') as pbr_file:
            records = read_records(pbr_file)

        # Every record shares the same feature dimension.
        dim = int(records[0].features['values'].float32_tensor.shape[0])

        lda = LDA(role='SageMakerRole', train_instance_type='ml.c4.xlarge', num_topics=10,
                  sagemaker_session=sagemaker_session, base_job_name='test-lda')

        train_set = prepare_record_set_from_local_files(
            lda_dir, lda.data_location, len(records), dim, sagemaker_session)
        validation_set = prepare_record_set_from_local_files(
            lda_dir, lda.data_location, len(records), dim, sagemaker_session)
        validation_set.channel = 'test'

        # Hyperparameters the tuner searches over.
        search_space = {'alpha0': ContinuousParameter(1, 10),
                        'num_topics': IntegerParameter(1, 2)}

        tuner = HyperparameterTuner(estimator=lda, objective_metric_name='test:pwll',
                                    hyperparameter_ranges=search_space,
                                    objective_type='Maximize', max_jobs=2, max_parallel_jobs=2)
        tuner.fit([train_set, validation_set], mini_batch_size=1)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(np.random.rand(1, dim))

        assert len(result) == 1
        for record in result:
            assert record.label['topic_mixture'] is not None
def test_object2vec(sagemaker_session, cpu_instance_type):
    """Train an Object2Vec model, deploy it, and score a single instance pair."""
    job_name = unique_name_from_base("object2vec")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        o2v_dir = os.path.join(DATA_DIR, "object2vec")
        # Training data is JSON-lines; the line count is the record count.
        with open(os.path.join(o2v_dir, "train.jsonl"), "r") as jsonl_file:
            record_count = len(jsonl_file.readlines())

        estimator = Object2Vec(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            epochs=3,
            enc0_max_seq_len=20,
            enc0_vocab_size=45000,
            enc_dim=16,
            num_classes=3,
            negative_sampling_rate=0,
            comparator_list="hadamard,concat,abs_diff",
            tied_token_embedding_weight=False,
            token_embedding_storage_type="dense",
            sagemaker_session=sagemaker_session,
        )
        record_set = prepare_record_set_from_local_files(
            o2v_dir, estimator.data_location, record_count, FEATURE_NUM, sagemaker_session)
        estimator.fit(records=record_set, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = Object2VecModel(estimator.model_data, role="SageMakerRole",
                                sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
        assert isinstance(predictor, Predictor)

        result = predictor.predict({"instances": [{"in0": [354, 623], "in1": [16]}]})
        assert len(result) == 1
        for record in result:
            assert record.label["scores"] is not None
def test_object2vec(sagemaker_session):
    """Train an Object2Vec model, deploy it, and score a single instance pair."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        o2v_dir = os.path.join(DATA_DIR, 'object2vec')
        # Training data is JSON-lines; the line count is the record count.
        with open(os.path.join(o2v_dir, 'train.jsonl'), 'r') as jsonl_file:
            record_count = len(jsonl_file.readlines())

        estimator = Object2Vec(role='SageMakerRole', train_instance_count=1,
                               train_instance_type='ml.c4.xlarge', epochs=3,
                               enc0_max_seq_len=20, enc0_vocab_size=45000,
                               enc_dim=16, num_classes=3,
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-object2vec')

        record_set = prepare_record_set_from_local_files(
            o2v_dir, estimator.data_location, record_count, FEATURE_NUM, sagemaker_session)
        estimator.fit(record_set, None)

    endpoint_name = unique_name_from_base('object2vec')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = Object2VecModel(estimator.model_data, role='SageMakerRole',
                                sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        assert isinstance(predictor, RealTimePredictor)

        result = predictor.predict({'instances': [{"in0": [354, 623], "in1": [16]}]})
        assert len(result) == 1
        for record in result:
            assert record.label["scores"] is not None
def test_ipinsights_serverless_inference(sagemaker_session, cpu_instance_type):
    """Train an IPInsights model and score a (user, IP) pair on a serverless endpoint."""
    job_name = unique_name_from_base("ipinsights-serverless")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        ip_dir = os.path.join(DATA_DIR, "ipinsights")
        # Training CSV is line-oriented; the line count is the record count.
        with open(os.path.join(ip_dir, "train.csv"), "rb") as csv_file:
            record_count = len(csv_file.readlines())

        estimator = IPInsights(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            num_entity_vectors=10,
            vector_dim=100,
            sagemaker_session=sagemaker_session,
        )
        record_set = prepare_record_set_from_local_files(
            ip_dir, estimator.data_location, record_count, FEATURE_DIM, sagemaker_session
        )
        estimator.fit(records=record_set, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = IPInsightsModel(
            estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        predictor = model.deploy(
            serverless_inference_config=ServerlessInferenceConfig(memory_size_in_mb=6144),
            endpoint_name=job_name,
        )
        assert isinstance(predictor, Predictor)

        result = predictor.predict([["user_1", "1.1.1.1"]])
        assert len(result["predictions"]) == 1
        assert 0 > result["predictions"][0]["dot_product"] > -1  # We expect ~ -0.22
def test_tuning_lda(sagemaker_session, cpu_instance_type):
    """Tune LDA hyperparameters with early stopping, re-attach to the job, and validate output."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        lda_dir = os.path.join(DATA_DIR, "lda")
        with open(os.path.join(lda_dir, "nips-train_1.pbr"), "rb") as pbr_file:
            records = read_records(pbr_file)

        # Every record shares the same feature dimension; read it off the first one.
        dim = int(records[0].features["values"].float32_tensor.shape[0])

        lda = LDA(
            role="SageMakerRole",
            instance_type=cpu_instance_type,
            num_topics=10,
            sagemaker_session=sagemaker_session,
        )

        train_set = prepare_record_set_from_local_files(
            lda_dir, lda.data_location, len(records), dim, sagemaker_session)
        validation_set = prepare_record_set_from_local_files(
            lda_dir, lda.data_location, len(records), dim, sagemaker_session)
        validation_set.channel = "test"

        # Hyperparameters the tuner searches over.
        search_space = {
            "alpha0": ContinuousParameter(1, 10),
            "num_topics": IntegerParameter(1, 2),
        }

        tuner = HyperparameterTuner(
            estimator=lda,
            objective_metric_name="test:pwll",
            hyperparameter_ranges=search_space,
            objective_type="Maximize",
            max_jobs=2,
            max_parallel_jobs=2,
            early_stopping_type="Auto",
        )

        tuning_job_name = unique_name_from_base("test-lda", max_length=32)
        print("Started hyperparameter tuning job with name:" + tuning_job_name)
        tuner.fit([train_set, validation_set], mini_batch_size=1, job_name=tuning_job_name)

        # Re-attach to the running job and check the tuner's configuration round-trips.
        attached_tuner = HyperparameterTuner.attach(
            tuning_job_name, sagemaker_session=sagemaker_session)
        assert attached_tuner.early_stopping_type == "Auto"
        assert attached_tuner.estimator.alpha0 == 1.0
        assert attached_tuner.estimator.num_topics == 1

        best_training_job = attached_tuner.best_training_job()

    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, cpu_instance_type)
        result = predictor.predict(np.random.rand(1, dim))

        assert len(result) == 1
        for record in result:
            assert record.label["topic_mixture"] is not None