def test_transform_byo_estimator(sagemaker_session, cpu_instance_type): data_path = os.path.join(DATA_DIR, "one_p_mnist") pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"} tags = [{"Key": "some-tag", "Value": "value-for-tag"}] # Load the data into memory as numpy arrays train_set_path = os.path.join(data_path, "mnist.pkl.gz") with gzip.open(train_set_path, "rb") as f: train_set, _, _ = pickle.load(f, **pickle_args) kmeans = KMeans( role="SageMakerRole", train_instance_count=1, train_instance_type=cpu_instance_type, k=10, sagemaker_session=sagemaker_session, output_path="s3://{}/".format(sagemaker_session.default_bucket()), ) # set kmeans specific hp kmeans.init_method = "random" kmeans.max_iterators = 1 kmeans.tol = 1 kmeans.num_trials = 1 kmeans.local_init_method = "kmeans++" kmeans.half_life_time_size = 1 kmeans.epochs = 1 records = kmeans.record_set(train_set[0][:100]) job_name = unique_name_from_base("test-kmeans-attach") with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): kmeans.fit(records, job_name=job_name) estimator = Estimator.attach(training_job_name=job_name, sagemaker_session=sagemaker_session) estimator._enable_network_isolation = True transform_input_path = os.path.join(data_path, "transform_input.csv") transform_input_key_prefix = "integ-test-data/one_p_mnist/transform" transform_input = kmeans.sagemaker_session.upload_data( path=transform_input_path, key_prefix=transform_input_key_prefix ) transformer = estimator.transformer(1, cpu_instance_type, tags=tags) transformer.transform(transform_input, content_type="text/csv") with timeout_and_delete_model_with_transformer( transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES ): transformer.wait() model_desc = sagemaker_session.sagemaker_client.describe_model( ModelName=transformer.model_name ) assert model_desc["EnableNetworkIsolation"] model_tags = sagemaker_session.sagemaker_client.list_tags( ResourceArn=model_desc["ModelArn"] )["Tags"] assert tags == model_tags
def test_async_byo_estimator(sagemaker_session, region): image_name = registry(region) + "/factorization-machines:1" endpoint_name = unique_name_from_base('byo') training_data_path = os.path.join(DATA_DIR, 'dummy_tensor') training_job_name = "" with timeout(minutes=5): data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz') pickle_args = {} if sys.version_info.major == 2 else { 'encoding': 'latin1' } with gzip.open(data_path, 'rb') as f: train_set, _, _ = pickle.load(f, **pickle_args) prefix = 'test_byo_estimator' key = 'recordio-pb-data' s3_train_data = sagemaker_session.upload_data(path=training_data_path, key_prefix=os.path.join( prefix, 'train', key)) estimator = Estimator(image_name=image_name, role='SageMakerRole', train_instance_count=1, train_instance_type='ml.c4.xlarge', sagemaker_session=sagemaker_session, base_job_name='test-byo') estimator.set_hyperparameters(num_factors=10, feature_dim=784, mini_batch_size=100, predictor_type='binary_classifier') # training labels must be 'float32' estimator.fit({'train': s3_train_data}, wait=False) training_job_name = estimator.latest_training_job.name with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session) model = estimator.create_model() predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name) predictor.serializer = fm_serializer predictor.content_type = 'application/json' predictor.deserializer = sagemaker.predictor.json_deserializer result = predictor.predict(train_set[0][:10]) assert len(result['predictions']) == 10 for prediction in result['predictions']: assert prediction['score'] is not None assert estimator.train_image() == image_name
def test_async_byo_estimator(sagemaker_session, region): image_name = registry(region) + "/factorization-machines:1" endpoint_name = unique_name_from_base("byo") training_data_path = os.path.join(DATA_DIR, "dummy_tensor") job_name = unique_name_from_base("byo") with timeout(minutes=5): data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz") pickle_args = {} if sys.version_info.major == 2 else { "encoding": "latin1" } with gzip.open(data_path, "rb") as f: train_set, _, _ = pickle.load(f, **pickle_args) prefix = "test_byo_estimator" key = "recordio-pb-data" s3_train_data = sagemaker_session.upload_data(path=training_data_path, key_prefix=os.path.join( prefix, "train", key)) estimator = Estimator( image_name=image_name, role="SageMakerRole", train_instance_count=1, train_instance_type="ml.c4.xlarge", sagemaker_session=sagemaker_session, ) estimator.set_hyperparameters(num_factors=10, feature_dim=784, mini_batch_size=100, predictor_type="binary_classifier") # training labels must be 'float32' estimator.fit({"train": s3_train_data}, wait=False, job_name=job_name) with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): estimator = Estimator.attach(training_job_name=job_name, sagemaker_session=sagemaker_session) model = estimator.create_model() predictor = model.deploy(1, "ml.m4.xlarge", endpoint_name=endpoint_name) predictor.serializer = fm_serializer predictor.content_type = "application/json" predictor.deserializer = sagemaker.predictor.json_deserializer result = predictor.predict(train_set[0][:10]) assert len(result["predictions"]) == 10 for prediction in result["predictions"]: assert prediction["score"] is not None assert estimator.train_image() == image_name
def load_model(self, model_name: str = "") -> None: """ Load the already trained model to not have to train again. Arguments: model_name: The name of the training job, as provided by AWS """ LOGGER.info(f"Loading already trained model {model_name}") self._model = Estimator.attach(training_job_name=model_name, sagemaker_session=self.executor.session)
def test_async_byo_estimator(sagemaker_session, region): image_name = registry(region) + "/factorization-machines:1" endpoint_name = name_from_base('byo') training_job_name = "" with timeout(minutes=5): data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz') pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'} with gzip.open(data_path, 'rb') as f: train_set, _, _ = pickle.load(f, **pickle_args) # take 100 examples for faster execution vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32') labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32') buf = io.BytesIO() write_numpy_to_dense_tensor(buf, vectors, labels) buf.seek(0) bucket = sagemaker_session.default_bucket() prefix = 'test_byo_estimator' key = 'recordio-pb-data' boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf) s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key) estimator = Estimator(image_name=image_name, role='SageMakerRole', train_instance_count=1, train_instance_type='ml.c4.xlarge', sagemaker_session=sagemaker_session, base_job_name='test-byo') estimator.set_hyperparameters(num_factors=10, feature_dim=784, mini_batch_size=100, predictor_type='binary_classifier') # training labels must be 'float32' estimator.fit({'train': s3_train_data}, wait=False) training_job_name = estimator.latest_training_job.name with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session) model = estimator.create_model() predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name) predictor.serializer = fm_serializer predictor.content_type = 'application/json' predictor.deserializer = sagemaker.predictor.json_deserializer result = predictor.predict(train_set[0][:10]) assert len(result['predictions']) == 10 for prediction in result['predictions']: assert prediction['score'] is not None assert estimator.train_image() == image_name
def test_transform_byo_estimator(sagemaker_session): data_path = os.path.join(DATA_DIR, 'one_p_mnist') pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'} tags = [{'Key': 'some-tag', 'Value': 'value-for-tag'}] # Load the data into memory as numpy arrays train_set_path = os.path.join(data_path, 'mnist.pkl.gz') with gzip.open(train_set_path, 'rb') as f: train_set, _, _ = pickle.load(f, **pickle_args) kmeans = KMeans(role='SageMakerRole', train_instance_count=1, train_instance_type='ml.c4.xlarge', k=10, sagemaker_session=sagemaker_session, output_path='s3://{}/'.format( sagemaker_session.default_bucket())) # set kmeans specific hp kmeans.init_method = 'random' kmeans.max_iterators = 1 kmeans.tol = 1 kmeans.num_trials = 1 kmeans.local_init_method = 'kmeans++' kmeans.half_life_time_size = 1 kmeans.epochs = 1 records = kmeans.record_set(train_set[0][:100]) job_name = unique_name_from_base('test-kmeans-attach') with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES): kmeans.fit(records, job_name=job_name) transform_input_path = os.path.join(data_path, 'transform_input.csv') transform_input_key_prefix = 'integ-test-data/one_p_mnist/transform' transform_input = kmeans.sagemaker_session.upload_data( path=transform_input_path, key_prefix=transform_input_key_prefix) estimator = Estimator.attach(training_job_name=job_name, sagemaker_session=sagemaker_session) transformer = estimator.transformer(1, 'ml.m4.xlarge', tags=tags) transformer.transform(transform_input, content_type='text/csv') with timeout_and_delete_model_with_transformer( transformer, sagemaker_session, minutes=TRANSFORM_DEFAULT_TIMEOUT_MINUTES): transformer.wait() model_desc = sagemaker_session.sagemaker_client.describe_model( ModelName=transformer.model_name) model_tags = sagemaker_session.sagemaker_client.list_tags( ResourceArn=model_desc['ModelArn'])['Tags'] assert tags == model_tags
def test_async_byo_estimator(sagemaker_session, region): image_name = registry(region) + "/factorization-machines:1" endpoint_name = name_from_base('byo') training_job_name = "" with timeout(minutes=5): data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz') pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'} with gzip.open(data_path, 'rb') as f: train_set, _, _ = pickle.load(f, **pickle_args) # take 100 examples for faster execution vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32') labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32') buf = io.BytesIO() write_numpy_to_dense_tensor(buf, vectors, labels) buf.seek(0) bucket = sagemaker_session.default_bucket() prefix = 'test_byo_estimator' key = 'recordio-pb-data' boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf) s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key) estimator = Estimator(image_name=image_name, role='SageMakerRole', train_instance_count=1, train_instance_type='ml.c4.xlarge', sagemaker_session=sagemaker_session, base_job_name='test-byo') estimator.set_hyperparameters(num_factors=10, feature_dim=784, mini_batch_size=100, predictor_type='binary_classifier') # training labels must be 'float32' estimator.fit({'train': s3_train_data}, wait=False) training_job_name = estimator.latest_training_job.name with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session) model = estimator.create_model() predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name) predictor.serializer = fm_serializer predictor.content_type = 'application/json' predictor.deserializer = sagemaker.predictor.json_deserializer result = predictor.predict(train_set[0][:10]) assert len(result['predictions']) == 10 for prediction in result['predictions']: assert prediction['score'] is not None assert estimator.train_image() == image_name
def test_async_byo_estimator(sagemaker_session, region, cpu_instance_type, training_set): image_uri = image_uris.retrieve("factorization-machines", region) endpoint_name = unique_name_from_base("byo") training_data_path = os.path.join(DATA_DIR, "dummy_tensor") job_name = unique_name_from_base("byo") with timeout(minutes=5): prefix = "test_byo_estimator" key = "recordio-pb-data" s3_train_data = sagemaker_session.upload_data(path=training_data_path, key_prefix=os.path.join( prefix, "train", key)) estimator = Estimator( image_uri=image_uri, role="SageMakerRole", instance_count=1, instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, ) estimator.set_hyperparameters(num_factors=10, feature_dim=784, mini_batch_size=100, predictor_type="binary_classifier") # training labels must be 'float32' estimator.fit({"train": s3_train_data}, wait=False, job_name=job_name) with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): estimator = Estimator.attach(training_job_name=job_name, sagemaker_session=sagemaker_session) model = estimator.create_model() predictor = model.deploy( 1, cpu_instance_type, endpoint_name=endpoint_name, serializer=_FactorizationMachineSerializer(), deserializer=sagemaker.deserializers.JSONDeserializer(), ) result = predictor.predict(training_set[0][:10]) assert len(result["predictions"]) == 10 for prediction in result["predictions"]: assert prediction["score"] is not None assert estimator.training_image_uri() == image_uri
def test_async_byo_estimator(sagemaker_session, region): image_name = registry(region) + "/factorization-machines:1" endpoint_name = name_from_base('byo') training_data_path = os.path.join(DATA_DIR, 'dummy_tensor') training_job_name = "" with timeout(minutes=5): data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz') pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'} with gzip.open(data_path, 'rb') as f: train_set, _, _ = pickle.load(f, **pickle_args) prefix = 'test_byo_estimator' key = 'recordio-pb-data' s3_train_data = sagemaker_session.upload_data(path=training_data_path, key_prefix=os.path.join(prefix, 'train', key)) estimator = Estimator(image_name=image_name, role='SageMakerRole', train_instance_count=1, train_instance_type='ml.c4.xlarge', sagemaker_session=sagemaker_session, base_job_name='test-byo') estimator.set_hyperparameters(num_factors=10, feature_dim=784, mini_batch_size=100, predictor_type='binary_classifier') # training labels must be 'float32' estimator.fit({'train': s3_train_data}, wait=False) training_job_name = estimator.latest_training_job.name with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session): estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session) model = estimator.create_model() predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name) predictor.serializer = fm_serializer predictor.content_type = 'application/json' predictor.deserializer = sagemaker.predictor.json_deserializer result = predictor.predict(train_set[0][:10]) assert len(result['predictions']) == 10 for prediction in result['predictions']: assert prediction['score'] is not None assert estimator.train_image() == image_name