def test_async_fit(sagemaker_session):
    """Launch MXNet MNIST training without blocking, then re-attach and deploy.

    ``fit`` is called with ``wait=False``; after a 20 second pause the job is
    re-attached by name, deployed to a new endpoint and sent one zero-filled
    request of shape (1, 1, 28, 28).
    """
    endpoint_name = 'test-mxnet-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
        entry_script = os.path.join(mnist_dir, 'mnist.py')

        mx = MXNet(entry_point=entry_script,
                   role='SageMakerRole',
                   train_instance_count=1,
                   train_instance_type='ml.c4.xlarge',
                   sagemaker_session=sagemaker_session)

        # Both channels are uploaded through the estimator's own session.
        session = mx.sagemaker_session
        train_s3 = session.upload_data(path=os.path.join(mnist_dir, 'train'),
                                       key_prefix='integ-test-data/mxnet_mnist/train')
        test_s3 = session.upload_data(path=os.path.join(mnist_dir, 'test'),
                                      key_prefix='integ-test-data/mxnet_mnist/test')

        mx.fit({'train': train_s3, 'test': test_s3}, wait=False)
        training_job_name = mx.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = MXNet.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        predictor = attached.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        zeros = numpy.zeros(shape=(1, 1, 28, 28))
        predictor.predict(zeros)
def test_linear_learner_multiclass(sagemaker_session):
    """Train a 10-class LinearLearner on 200 MNIST rows, deploy it and check predictions.

    Each of the 100 returned records must carry a non-None ``predicted_label``
    and ``score``.
    """
    with timeout(minutes=15):
        mnist_archive = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        # Python 2's pickle.load takes no extra arguments; otherwise ask for latin1.
        if sys.version_info.major == 2:
            pickle_args = {}
        else:
            pickle_args = {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(mnist_archive, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # Labels are cast to float32 before being turned into a record set.
        features = train_set[0]
        labels = train_set[1].astype(np.dtype('float32'))

        ll = LinearLearner('SageMakerRole', 1, 'ml.c4.2xlarge',
                           base_job_name='test-linear-learner',
                           predictor_type='multiclass_classifier',
                           num_classes=10,
                           sagemaker_session=sagemaker_session)

        ll.epochs = 1
        training_records = ll.record_set(features[:200], labels[:200])
        ll.fit(training_records)

    endpoint_name = name_from_base('linear-learner')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = ll.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        result = predictor.predict(features[0:100])
        assert len(result) == 100
        for record in result:
            label = record.label
            assert label["predicted_label"] is not None
            assert label["score"] is not None
Ejemplo n.º 3
0
def test_tf(sagemaker_session, tf_full_version):
    """Train the iris DNN classifier with TensorFlow, deploy it and compare two request forms.

    The same feature vector is sent once wrapped in a dict keyed by
    ``'inputs'`` and once as a bare list; both responses must be equal.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        entry_script = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=entry_script,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        s3_inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(s3_inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    # The endpoint is named after the training job.
    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]

        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))

        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
def fixture_training_job(sagemaker_session, pytorch_full_version):
    """Fit a PyTorch estimator on 'ml.c4.xlarge' and return its latest training job name."""
    instance_type = 'ml.c4.xlarge'
    with timeout(minutes=15):
        estimator = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)

        training_data = _upload_training_data(estimator)
        estimator.fit({'training': training_data})

        job_name = estimator.latest_training_job.name
        return job_name
Ejemplo n.º 5
0
def test_knn_regressor(sagemaker_session):
    """Train a KNN regressor on 200 MNIST rows, deploy a KNNModel and check predictions.

    Ten rows are sent to the endpoint; every returned record must carry a
    non-None ``score``.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_archive = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        # Python 2's pickle.load takes no extra arguments; otherwise ask for latin1.
        if sys.version_info.major == 2:
            pickle_args = {}
        else:
            pickle_args = {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(mnist_archive, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        knn = KNN(role='SageMakerRole',
                  train_instance_count=1,
                  train_instance_type='ml.c4.xlarge',
                  k=10,
                  predictor_type='regressor',
                  sample_size=500,
                  sagemaker_session=sagemaker_session,
                  base_job_name='test-knn-rr')

        features = train_set[0][:200]
        # training labels must be 'float32'
        labels = train_set[1][:200].astype('float32')
        knn.fit(knn.record_set(features, labels))

    endpoint_name = name_from_base('knn')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        knn_model = KNNModel(knn.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = knn_model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        query_rows = train_set[0][:10]
        result = predictor.predict(query_rows)

        assert len(result) == 10
        for record in result:
            assert record.label["score"] is not None
Ejemplo n.º 6
0
def test_failed_tf_training(sagemaker_session, tf_full_version):
    """Check that a failing TensorFlow script raises ValueError and VPC settings reach the job.

    ``fit`` must raise ``ValueError`` containing 'This failure is expected'.
    The described training job must then report exactly the subnet and
    security group that were given to the estimator.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        failing_script = os.path.join(DATA_DIR, 'iris', 'failure_script.py')

        ec2_client = sagemaker_session.boto_session.client('ec2')
        subnet, security_group_id = get_or_create_subnet_and_security_group(ec2_client, VPC_NAME)

        estimator = TensorFlow(entry_point=failing_script,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               subnets=[subnet],
                               security_group_ids=[security_group_id])

        session = estimator.sagemaker_session
        s3_inputs = session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf-failure')

        with pytest.raises(ValueError) as excinfo:
            estimator.fit(s3_inputs)
        assert 'This failure is expected' in str(excinfo.value)

        # Read the job back to see which VPC configuration it was created with.
        job_desc = estimator.sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=estimator.latest_training_job.name)
        vpc_config = job_desc['VpcConfig']
        assert [subnet] == vpc_config['Subnets']
        assert [security_group_id] == vpc_config['SecurityGroupIds']
Ejemplo n.º 7
0
def test_tf_async(sagemaker_session):
    """Start TensorFlow iris training without blocking, then re-attach, deploy and predict.

    ``fit`` is called with ``wait=False``; after a 20 second pause the job is
    re-attached by name and deployed to an endpoint named after the job.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        entry_script = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=entry_script,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        session = estimator.sagemaker_session
        s3_inputs = session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')

        estimator.fit(s3_inputs, wait=False)
        training_job_name = estimator.latest_training_job.name
        time.sleep(20)

    endpoint_name = training_job_name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name,
                                      sagemaker_session=sagemaker_session)
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        result = json_predictor.predict(features)
        print('predict result: {}'.format(result))
Ejemplo n.º 8
0
def test_ntm(sagemaker_session):
    """Train NTM on local record files, deploy an NTMModel and check topic weights.

    One random row with the training feature count is sent to the endpoint;
    the single returned record must carry non-None ``topic_weights``.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'ntm')
        records_file = os.path.join(data_path, 'nips-train_1.pbr')

        with open(records_file, 'rb') as f:
            all_records = read_records(f)

        # all records must be same
        first_record = all_records[0]
        feature_num = int(first_record.features['values'].float32_tensor.shape[0])

        ntm = NTM(role='SageMakerRole',
                  train_instance_count=1,
                  train_instance_type='ml.c4.xlarge',
                  num_topics=10,
                  sagemaker_session=sagemaker_session,
                  base_job_name='test-ntm')

        record_count = len(all_records)
        record_set = prepare_record_set_from_local_files(data_path, ntm.data_location,
                                                         record_count, feature_num, sagemaker_session)
        ntm.fit(record_set, None)

    endpoint_name = name_from_base('ntm')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        ntm_model = NTMModel(ntm.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = ntm_model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label["topic_weights"] is not None
Ejemplo n.º 9
0
def test_cifar(sagemaker_session, tf_full_version):
    """Train ResNet on CIFAR-10 with TensorFlow, deploy it and check the output size.

    Training runs on two 'ml.p2.xlarge' instances with log streaming disabled.
    The deployed predictor is switched to pickle serialization, sent one
    random (32, 32, 3) array, and must return 10 probabilities.
    """
    with timeout(minutes=45):
        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')

        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

        estimator = TensorFlow(entry_point='resnet_cifar_10.py', source_dir=script_path, role='SageMakerRole',
                               framework_version=tf_full_version, training_steps=500, evaluation_steps=5,
                               train_instance_count=2, train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session, train_max_run=45 * 60,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path, key_prefix='data/cifar10')
        estimator.fit(inputs, logs=False)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        # Name the endpoint explicitly so the endpoint that gets created is
        # the one the surrounding cleanup guard is keyed on, rather than
        # relying on deploy()'s default naming.
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge',
                                     endpoint_name=endpoint_name)
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
def test_factorization_machines(sagemaker_session):
    """Train a FactorizationMachines regressor on 200 MNIST rows, deploy and check scores.

    Ten rows are sent to the endpoint; every returned record must carry a
    non-None ``score``.
    """
    with timeout(minutes=15):
        mnist_archive = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        # Python 2's pickle.load takes no extra arguments; otherwise ask for latin1.
        if sys.version_info.major == 2:
            pickle_args = {}
        else:
            pickle_args = {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(mnist_archive, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        fm = FactorizationMachines(role='SageMakerRole',
                                   train_instance_count=1,
                                   train_instance_type='ml.c4.xlarge',
                                   num_factors=10,
                                   predictor_type='regressor',
                                   epochs=2,
                                   clip_gradient=1e2,
                                   eps=0.001,
                                   rescale_grad=1.0 / 100,
                                   sagemaker_session=sagemaker_session,
                                   base_job_name='test-fm')

        features = train_set[0][:200]
        # training labels must be 'float32'
        labels = train_set[1][:200].astype('float32')
        fm.fit(fm.record_set(features, labels))

    endpoint_name = name_from_base('fm')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        fm_model = FactorizationMachinesModel(fm.model_data, role='SageMakerRole',
                                              sagemaker_session=sagemaker_session)
        predictor = fm_model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        query_rows = train_set[0][:10]
        result = predictor.predict(query_rows)

        assert len(result) == 10
        for record in result:
            assert record.label["score"] is not None
Ejemplo n.º 11
0
def test_lda(sagemaker_session):
    """Train LDA on local record files, deploy an LDAModel and check the topic mixture.

    One random row with the training feature count is sent to the endpoint;
    the single returned record must carry a non-None ``topic_mixture``.
    """
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'lda')
        records_file = os.path.join(data_path, 'nips-train_1.pbr')

        with open(records_file, 'rb') as f:
            all_records = read_records(f)

        # all records must be same
        first_record = all_records[0]
        feature_num = int(first_record.features['values'].float32_tensor.shape[0])

        lda = LDA(role='SageMakerRole',
                  train_instance_type='ml.c4.xlarge',
                  num_topics=10,
                  sagemaker_session=sagemaker_session,
                  base_job_name='test-lda')

        record_count = len(all_records)
        record_set = prepare_record_set_from_local_files(data_path, lda.data_location,
                                                         record_count, feature_num, sagemaker_session)
        lda.fit(record_set, 100)

    endpoint_name = name_from_base('lda')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        lda_model = LDAModel(lda.model_data, role='SageMakerRole', sagemaker_session=sagemaker_session)
        predictor = lda_model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for record in result:
            assert record.label["topic_mixture"] is not None
Ejemplo n.º 12
0
def test_pca(sagemaker_session):
    """Train PCA on 100 MNIST rows, deploy a PCAModel and check projections.

    Five rows are sent to the endpoint; every returned record must carry a
    non-None ``projection``.
    """
    with timeout(minutes=15):
        mnist_archive = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        # Python 2's pickle.load takes no extra arguments; otherwise ask for latin1.
        if sys.version_info.major == 2:
            pickle_args = {}
        else:
            pickle_args = {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(mnist_archive, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        pca = sagemaker.amazon.pca.PCA(role='SageMakerRole',
                                       train_instance_count=1,
                                       train_instance_type='ml.m4.xlarge',
                                       num_components=48,
                                       sagemaker_session=sagemaker_session,
                                       base_job_name='test-pca')

        # Hyperparameters set as attributes after construction.
        pca.algorithm_mode = 'randomized'
        pca.subtract_mean = True
        pca.extra_components = 5

        training_records = pca.record_set(train_set[0][:100])
        pca.fit(training_records)

    endpoint_name = name_from_base('pca')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        pca_model = sagemaker.amazon.pca.PCAModel(model_data=pca.model_data,
                                                  role='SageMakerRole',
                                                  sagemaker_session=sagemaker_session)
        predictor = pca_model.deploy(initial_instance_count=1,
                                     instance_type="ml.c4.xlarge",
                                     endpoint_name=endpoint_name)

        query_rows = train_set[0][:5]
        result = predictor.predict(query_rows)

        assert len(result) == 5
        for record in result:
            assert record.label["projection"] is not None
Ejemplo n.º 13
0
def test_tuning_mxnet(sagemaker_session):
    """Tune the MXNet MNIST learning rate, then deploy through the tuner and predict.

    The tuning job runs 4 training jobs, 2 at a time, on the
    'Validation-accuracy' metric. Endpoint cleanup is keyed on the name
    returned by ``tuner.best_training_job()``.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')
        tuning_script = os.path.join(mnist_dir, 'tuning.py')

        estimator = MXNet(entry_point=tuning_script,
                          role='SageMakerRole',
                          train_instance_count=1,
                          train_instance_type='ml.m4.xlarge',
                          sagemaker_session=sagemaker_session,
                          base_job_name='tune-mxnet')

        objective_metric_name = 'Validation-accuracy'
        metric_definitions = [{'Name': 'Validation-accuracy', 'Regex': 'Validation-accuracy=([0-9\\.]+)'}]
        hyperparameter_ranges = {'learning_rate': ContinuousParameter(0.01, 0.2)}

        tuner = HyperparameterTuner(estimator, objective_metric_name, hyperparameter_ranges,
                                    metric_definitions, max_jobs=4, max_parallel_jobs=2)

        session = estimator.sagemaker_session
        train_s3 = session.upload_data(path=os.path.join(mnist_dir, 'train'),
                                       key_prefix='integ-test-data/mxnet_mnist/train')
        test_s3 = session.upload_data(path=os.path.join(mnist_dir, 'test'),
                                      key_prefix='integ-test-data/mxnet_mnist/test')
        tuner.fit({'train': train_s3, 'test': test_s3})

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        # Pause briefly before blocking on the tuning job.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        zeros = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(zeros)
def test_async_fit_deploy(sagemaker_session, pytorch_full_version):
    """Start PyTorch training without blocking, then re-attach, deploy and check output shape.

    The deploy/predict half is skipped when ``_is_local_mode`` reports the
    instance type as local. Otherwise a batch of 100 random (1, 28, 28)
    float32 inputs must yield an output of shape (100, 10).
    """
    training_job_name = ""
    # TODO: add tests against local mode when it's ready to be used
    instance_type = 'ml.p2.xlarge'

    with timeout(minutes=10):
        pytorch = _get_pytorch_estimator(sagemaker_session, pytorch_full_version, instance_type)

        training_data = _upload_training_data(pytorch)
        pytorch.fit({'training': training_data}, wait=False)
        training_job_name = pytorch.latest_training_job.name

        print("Waiting to re-attach to the training job: %s" % training_job_name)
        time.sleep(20)

    if _is_local_mode(instance_type):
        return

    endpoint_name = 'test-pytorch-async-fit-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % training_job_name)
        attached = PyTorch.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        predictor = attached.deploy(1, instance_type, endpoint_name=endpoint_name)

        batch_size = 100
        batch = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        output = predictor.predict(batch)

        assert output.shape == (batch_size, 10)
def test_failed_training_job(sagemaker_session, pytorch_full_version):
    """Check that a failing PyTorch entry point makes ``fit`` raise the expected ValueError."""
    failing_script = os.path.join(MNIST_DIR, 'failure_script.py')

    with timeout(minutes=15):
        estimator = _get_pytorch_estimator(sagemaker_session, pytorch_full_version,
                                           entry_point=failing_script)

        with pytest.raises(ValueError) as excinfo:
            estimator.fit(_upload_training_data(estimator))
        assert 'This failure is expected' in str(excinfo.value)
Ejemplo n.º 16
0
def test_async_kmeans(sagemaker_session):
    """Start KMeans training without blocking, then re-attach, deploy and check predictions.

    Before training, the estimator's ``hyperparameters()`` output is compared
    against the attribute values set on it. Ten rows are later sent to the
    endpoint; each record must carry ``closest_cluster`` and
    ``distance_to_cluster``.
    """
    training_job_name = ""
    endpoint_name = name_from_base('kmeans')

    with timeout(minutes=5):
        mnist_archive = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        # Python 2's pickle.load takes no extra arguments; otherwise ask for latin1.
        if sys.version_info.major == 2:
            pickle_args = {}
        else:
            pickle_args = {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(mnist_archive, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole',
                        train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10,
                        sagemaker_session=sagemaker_session,
                        base_job_name='test-kmeans')

        kmeans.init_method = 'random'
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1
        kmeans.center_factor = 1

        # Expected mapping from estimator attributes to hyperparameter names.
        expected_hyperparameters = {
            'init_method': kmeans.init_method,
            'local_lloyd_max_iter': str(kmeans.max_iterations),
            'local_lloyd_tol': str(kmeans.tol),
            'local_lloyd_num_trials': str(kmeans.num_trials),
            'local_lloyd_init_method': kmeans.local_init_method,
            'half_life_time_size': str(kmeans.half_life_time_size),
            'epochs': str(kmeans.epochs),
            'extra_center_factor': str(kmeans.center_factor),
            'k': str(kmeans.k),
            'force_dense': 'True',
        }
        assert kmeans.hyperparameters() == expected_hyperparameters

        training_records = kmeans.record_set(train_set[0][:100])
        kmeans.fit(training_records, wait=False)
        training_job_name = kmeans.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)
        print("attaching now...")

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        attached = KMeans.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        kmeans_model = KMeansModel(attached.model_data, role='SageMakerRole',
                                   sagemaker_session=sagemaker_session)
        predictor = kmeans_model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        query_rows = train_set[0][:10]
        result = predictor.predict(query_rows)

        assert len(result) == 10
        for record in result:
            label = record.label
            assert label["closest_cluster"] is not None
            assert label["distance_to_cluster"] is not None
def test_async_byo_estimator(sagemaker_session, region):
    """Train the generic Estimator on a factorization-machines image without blocking.

    100 MNIST rows are written as a dense tensor and uploaded to the session's
    default bucket. Training is started with ``wait=False``, then the job is
    re-attached by name, a model is created and deployed, and ten rows are
    predicted through a JSON serializer/deserializer pair. Finally the
    attached estimator must report the same training image it was started with.
    """
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = name_from_base('byo')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        # take 100 examples for faster execution
        vectors = np.array([t.tolist() for t in train_set[0][:100]]).astype('float32')
        # Binary target: 1.0 where the original label is 0, else 0.0.
        # training labels must be 'float32'
        labels = np.where(np.array([t.tolist() for t in train_set[1][:100]]) == 0, 1.0, 0.0).astype('float32')

        buf = io.BytesIO()
        write_numpy_to_dense_tensor(buf, vectors, labels)
        buf.seek(0)

        bucket = sagemaker_session.default_bucket()
        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'
        # S3 object keys always use '/'. Building the key with os.path.join
        # would use the platform separator and, on Windows, no longer match
        # the s3:// URI passed to fit().
        train_key = '{}/train/{}'.format(prefix, key)
        boto3.resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(buf)
        s3_train_data = 's3://{}/{}'.format(bucket, train_key)

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        # The default predictor is switched to JSON in both directions.
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
def test_byo_estimator(sagemaker_session, region):
    """Use Factorization Machines algorithm as an example here.

    First we upload the prepared training data found under ``dummy_tensor``
    in the test data directory to S3.
    Then we create the Estimator and set hyperparameters as required by the algorithm.
    Next, we can call fit() with path to the S3.
    Later the trained model is deployed and prediction is called against the endpoint.
    Default predictor is updated with json serializer and deserializer.

    """
    image_name = registry(region) + "/factorization-machines:1"
    training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        # The MNIST sample is loaded only to provide rows for the prediction below.
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'

        # S3 key prefixes always use '/'. os.path.join would use the platform
        # separator and produce a backslash-separated prefix on Windows.
        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix='{}/train/{}'.format(prefix, key))

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        estimator.fit({'train': s3_train_data})

    endpoint_name = name_from_base('byo')

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None
Ejemplo n.º 19
0
def test_tuning_kmeans(sagemaker_session):
    """Run a KMeans hyperparameter tuning job, then deploy through the tuner and predict.

    The tuner minimizes 'test:msd' over four hyperparameter ranges using
    2 training jobs run in parallel. Endpoint cleanup is keyed on the name
    returned by ``tuner.best_training_job()``. Ten rows are sent to the
    endpoint; each record must carry ``closest_cluster`` and
    ``distance_to_cluster``.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(role='SageMakerRole', train_instance_count=1,
                        train_instance_type='ml.c4.xlarge',
                        k=10, sagemaker_session=sagemaker_session, base_job_name='tk',
                        output_path='s3://{}/'.format(sagemaker_session.default_bucket()))

        # set kmeans specific hp
        kmeans.init_method = 'random'
        # The attribute is ``max_iterations``; the former ``max_iterators``
        # spelling only created an unrelated attribute and set nothing.
        kmeans.max_iterations = 1
        kmeans.tol = 1
        kmeans.num_trials = 1
        kmeans.local_init_method = 'kmeans++'
        kmeans.half_life_time_size = 1
        kmeans.epochs = 1

        records = kmeans.record_set(train_set[0][:100])
        test_records = kmeans.record_set(train_set[0][:100], channel='test')

        # specify which hp you want to optimize over
        hyperparameter_ranges = {'extra_center_factor': IntegerParameter(1, 10),
                                 'mini_batch_size': IntegerParameter(10, 100),
                                 'epochs': IntegerParameter(1, 2),
                                 'init_method': CategoricalParameter(['kmeans++', 'random'])}
        objective_metric_name = 'test:msd'

        tuner = HyperparameterTuner(estimator=kmeans, objective_metric_name=objective_metric_name,
                                    hyperparameter_ranges=hyperparameter_ranges, objective_type='Minimize', max_jobs=2,
                                    max_parallel_jobs=2)

        tuner.fit([records, test_records])

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        # Pause briefly before blocking on the tuning job.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        result = predictor.predict(train_set[0][:10])

        assert len(result) == 10
        for record in result:
            assert record.label['closest_cluster'] is not None
            assert record.label['distance_to_cluster'] is not None
def test_sync_fit_deploy(pytorch_training_job, sagemaker_session):
    """Attach to the fixture-provided PyTorch training job, deploy it and check predictions.

    One zero-filled (1, 1, 28, 28) request is sent first, then a batch of 100
    random inputs whose output must have shape (100, 10).

    The deployment runs inside ``timeout_and_delete_endpoint_by_name`` — the
    guard used around every other deployment in these tests — instead of a
    plain ``timeout``, which left the created endpoint behind.
    NOTE(review): the guard's default time limit now applies instead of the
    previous explicit 20 minutes; pass an explicit limit if the helper
    supports one.
    """
    # TODO: add tests against local mode when it's ready to be used
    endpoint_name = 'test-pytorch-sync-fit-attach-deploy{}'.format(sagemaker_timestamp())
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = PyTorch.attach(pytorch_training_job, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        data = numpy.zeros(shape=(1, 1, 28, 28), dtype=numpy.float32)
        predictor.predict(data)

        batch_size = 100
        data = numpy.random.rand(batch_size, 1, 28, 28).astype(numpy.float32)
        output = predictor.predict(data)

        assert output.shape == (batch_size, 10)
Ejemplo n.º 21
0
def test_tuning_chainer(sagemaker_session):
    """Tune the Chainer MNIST ``alpha`` hyperparameter, then deploy through the tuner and predict.

    The tuning job runs 2 training jobs in parallel on the
    'Validation-accuracy' metric. The deployed predictor is queried with three
    zero-filled batches of 100 items shaped (784,), (1, 28, 28) and (28, 28);
    each response must contain 100 entries.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'chainer_mnist', 'mnist.py')
        data_path = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator = Chainer(entry_point=script_path,
                            role='SageMakerRole',
                            train_instance_count=1,
                            train_instance_type='ml.c4.xlarge',
                            sagemaker_session=sagemaker_session,
                            hyperparameters={'epochs': 1})

        train_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'train'),
                                                              key_prefix='integ-test-data/chainer_mnist/train')
        test_input = estimator.sagemaker_session.upload_data(path=os.path.join(data_path, 'test'),
                                                             key_prefix='integ-test-data/chainer_mnist/test')

        hyperparameter_ranges = {'alpha': ContinuousParameter(0.001, 0.005)}

        objective_metric_name = 'Validation-accuracy'
        # Raw string: the pattern text is unchanged, but the backslashes are no
        # longer invalid escape sequences in a regular string literal.
        metric_definitions = [
            {'Name': 'Validation-accuracy', 'Regex': r'\[J1\s+\d\.\d+\s+\d\.\d+\s+\d\.\d+\s+(\d\.\d+)'}]

        tuner = HyperparameterTuner(estimator, objective_metric_name, hyperparameter_ranges, metric_definitions,
                                    max_jobs=2, max_parallel_jobs=2)

        tuner.fit({'train': train_input, 'test': test_input})

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        # Pause briefly before blocking on the tuning job.
        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        batch_size = 100
        data = np.zeros((batch_size, 784), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 1, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size

        data = np.zeros((batch_size, 28, 28), dtype='float32')
        output = predictor.predict(data)
        assert len(output) == batch_size
def test_failed_training_job(sagemaker_session, chainer_full_version):
    """Expect ``fit`` to raise ``ValueError`` when the Chainer entry point is ``failure_script.py``."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        failing_script = os.path.join(DATA_DIR, 'chainer_mnist', 'failure_script.py')
        mnist_dir = os.path.join(DATA_DIR, 'chainer_mnist')

        estimator = Chainer(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=chainer_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        # Only the training channel is uploaded for this test.
        train_s3_uri = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/chainer_mnist/train',
        )

        with pytest.raises(ValueError):
            estimator.fit(train_s3_uri)
def test_async_fit(sagemaker_session):
    """Start a Chainer MNIST job without waiting, re-attach to it by name, then deploy and predict."""
    endpoint_name = 'test-chainer-attach-deploy-{}'.format(sagemaker_timestamp())

    with timeout(minutes=5):
        job_name = _run_mnist_training_job(
            sagemaker_session,
            "ml.c4.xlarge",
            1,
            chainer_full_version=CHAINER_VERSION,
            wait=False,
        )

        print("Waiting to re-attach to the training job: %s" % job_name)
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % job_name)
        attached = Chainer.attach(training_job_name=job_name,
                                  sagemaker_session=sagemaker_session)
        endpoint_predictor = attached.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name)
        _predict_and_assert(endpoint_predictor)
def test_failed_training_job(sagemaker_session, mxnet_full_version):
    """Expect the MXNet ``failure_script.py`` job to raise ``ValueError`` with its marker message."""
    with timeout(minutes=15):
        failing_script = os.path.join(DATA_DIR, 'mxnet_mnist', 'failure_script.py')
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(
            entry_point=failing_script,
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        train_s3_uri = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'train'),
            key_prefix='integ-test-data/mxnet_mnist/train-failure',
        )

        with pytest.raises(ValueError) as raised:
            estimator.fit(train_s3_uri)
        # The error text is checked as well as the exception type.
        assert 'This failure is expected' in str(raised.value)
def test_async_byo_estimator(sagemaker_session, region):
    """Train a bring-your-own-image estimator asynchronously, re-attach, deploy and predict.

    The training job is started with ``wait=False``; the estimator is then
    re-created from the job name with ``Estimator.attach`` and a model built
    from it is deployed behind ``endpoint_name``.

    Args:
        sagemaker_session: session used for uploads, training and deployment.
        region: region passed to ``registry`` to build the image URI.
    """
    image_name = registry(region) + "/factorization-machines:1"
    endpoint_name = name_from_base('byo')
    training_data_path = os.path.join(DATA_DIR, 'dummy_tensor')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        # Python 3 needs an explicit encoding to load this pickle.
        pickle_args = {} if sys.version_info.major == 2 else {'encoding': 'latin1'}

        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        prefix = 'test_byo_estimator'
        key = 'recordio-pb-data'

        # S3 keys are '/'-delimited regardless of the local OS, so the prefix is
        # joined explicitly instead of with os.path.join.
        s3_train_data = sagemaker_session.upload_data(path=training_data_path,
                                                      key_prefix='/'.join([prefix, 'train', key]))

        estimator = Estimator(image_name=image_name,
                              role='SageMakerRole', train_instance_count=1,
                              train_instance_type='ml.c4.xlarge',
                              sagemaker_session=sagemaker_session, base_job_name='test-byo')

        estimator.set_hyperparameters(num_factors=10,
                                      feature_dim=784,
                                      mini_batch_size=100,
                                      predictor_type='binary_classifier')

        # Start the job without blocking; it is re-attached by name below.
        estimator.fit({'train': s3_train_data}, wait=False)
        training_job_name = estimator.latest_training_job.name

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = Estimator.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        model = estimator.create_model()
        predictor = model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)
        predictor.serializer = fm_serializer
        predictor.content_type = 'application/json'
        predictor.deserializer = sagemaker.predictor.json_deserializer

        result = predictor.predict(train_set[0][:10])

        assert len(result['predictions']) == 10
        for prediction in result['predictions']:
            assert prediction['score'] is not None

        assert estimator.train_image() == image_name
def mxnet_training_job(sagemaker_session, mxnet_full_version):
    """Train the MXNet MNIST script on one ``ml.c4.xlarge`` instance and return the job name."""
    with timeout(minutes=15):
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator = MXNet(
            entry_point=os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py'),
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )

        upload = estimator.sagemaker_session.upload_data
        train_s3_uri = upload(path=os.path.join(mnist_dir, 'train'),
                              key_prefix='integ-test-data/mxnet_mnist/train')
        test_s3_uri = upload(path=os.path.join(mnist_dir, 'test'),
                             key_prefix='integ-test-data/mxnet_mnist/test')

        estimator.fit({'train': train_s3_uri, 'test': test_s3_uri})
        return estimator.latest_training_job.name
Ejemplo n.º 27
0
def test_tuning_lda(sagemaker_session):
    """Tune LDA over ``alpha0`` and ``num_topics``, deploy through the tuner and check a prediction."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        lda_dir = os.path.join(DATA_DIR, 'lda')
        records_filename = 'nips-train_1.pbr'

        with open(os.path.join(lda_dir, records_filename), 'rb') as records_file:
            records = read_records(records_file)

        # The feature count is read from the first record only; all records must be same.
        feature_num = int(records[0].features['values'].float32_tensor.shape[0])

        lda = LDA(
            role='SageMakerRole',
            train_instance_type='ml.c4.xlarge',
            num_topics=10,
            sagemaker_session=sagemaker_session,
            base_job_name='test-lda',
        )

        train_records = prepare_record_set_from_local_files(
            lda_dir, lda.data_location, len(records), feature_num, sagemaker_session)
        test_records = prepare_record_set_from_local_files(
            lda_dir, lda.data_location, len(records), feature_num, sagemaker_session)
        # Same local files, but registered under the 'test' channel.
        test_records.channel = 'test'

        # Hyperparameters the tuner searches over.
        search_space = {
            'alpha0': ContinuousParameter(1, 10),
            'num_topics': IntegerParameter(1, 2),
        }

        tuner = HyperparameterTuner(
            estimator=lda,
            objective_metric_name='test:pwll',
            hyperparameter_ranges=search_space,
            objective_type='Maximize',
            max_jobs=2,
            max_parallel_jobs=2,
        )

        tuner.fit([train_records, test_records], mini_batch_size=1)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_job_name = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_job_name, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        sample = np.random.rand(1, feature_num)
        predictions = predictor.predict(sample)

        assert len(predictions) == 1
        for prediction in predictions:
            assert prediction.label['topic_mixture'] is not None
Ejemplo n.º 28
0
def test_failed_tf_training(sagemaker_session, tf_full_version):
    """Expect the TensorFlow ``failure_script.py`` job to raise ``ValueError`` with its marker message."""
    with timeout(minutes=15):
        estimator_kwargs = dict(
            entry_point=os.path.join(DATA_DIR, 'iris', 'failure_script.py'),
            role='SageMakerRole',
            framework_version=tf_full_version,
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={'input_tensor_name': 'inputs'},
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )
        tf_estimator = TensorFlow(**estimator_kwargs)

        s3_inputs = tf_estimator.sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix='integ-test-data/tf-failure')

        with pytest.raises(ValueError) as raised:
            tf_estimator.fit(s3_inputs)
        assert 'This failure is expected' in str(raised.value)
def test_tf_local_mode(tf_full_version, sagemaker_local_session):
    """Train the iris DNN classifier in local mode, serve it locally and compare predictions.

    Serving is guarded by an exclusive ``fcntl`` lock on ``LOCK_PATH``. The
    lock file is opened in a ``with`` block so its descriptor is always
    closed, and the unlock runs in its own ``finally`` so it is not skipped
    when endpoint deletion raises.

    Args:
        tf_full_version: TensorFlow framework version passed to the estimator.
        sagemaker_local_session: session used for local training and serving.
    """
    with open(LOCK_PATH, 'w') as local_mode_lock_fd:
        local_mode_lock = local_mode_lock_fd.fileno()
        with timeout(minutes=5):
            script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

            estimator = TensorFlow(entry_point=script_path,
                                   role='SageMakerRole',
                                   framework_version=tf_full_version,
                                   training_steps=1,
                                   evaluation_steps=1,
                                   hyperparameters={'input_tensor_name': 'inputs'},
                                   train_instance_count=1,
                                   train_instance_type='local',
                                   base_job_name='test-tf',
                                   sagemaker_session=sagemaker_local_session)

            inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH,
                                                             key_prefix='integ-test-data/tf_iris')
            estimator.fit(inputs)
            print('job succeeded: {}'.format(estimator.latest_training_job.name))

        endpoint_name = estimator.latest_training_job.name

        # Since Local Mode uses the same port for serving, we need a lock in order
        # to allow concurrent test execution. The serving test is really fast so it still
        # makes sense to allow this behavior.
        fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
        try:
            json_predictor = estimator.deploy(initial_instance_count=1,
                                              instance_type='local',
                                              endpoint_name=endpoint_name)

            features = [6.4, 3.2, 4.5, 1.5]
            dict_result = json_predictor.predict({'inputs': features})
            print('predict result: {}'.format(dict_result))
            list_result = json_predictor.predict(features)
            print('predict result: {}'.format(list_result))

            # Both request shapes must yield the same prediction.
            assert dict_result == list_result
        finally:
            try:
                estimator.delete_endpoint()
                time.sleep(5)
            finally:
                # Release the lock even when endpoint deletion fails.
                fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)
Ejemplo n.º 30
0
def test_tuning_tf(sagemaker_session):
    """Tune the iris TensorFlow estimator on ``learning_rate``, deploy via the tuner and predict."""
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        tf_estimator = TensorFlow(
            entry_point=os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py'),
            role='SageMakerRole',
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={'input_tensor_name': 'inputs'},
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
            base_job_name='tune-tf',
        )

        s3_inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        search_space = {'learning_rate': ContinuousParameter(0.05, 0.2)}

        # The objective metric is parsed from the training log with this regex.
        metric_name = 'loss'
        metrics = [{'Name': 'loss', 'Regex': 'loss = ([0-9\\.]+)'}]

        tuner = HyperparameterTuner(tf_estimator, metric_name, search_space, metrics,
                                    objective_type='Minimize', max_jobs=2, max_parallel_jobs=2)

        tuner.fit(s3_inputs)

        print('Started hyperparameter tuning job with name:' + tuner.latest_tuning_job.name)

        time.sleep(15)
        tuner.wait()

    best_job_name = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_job_name, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')

        sample = [6.4, 3.2, 4.5, 1.5]
        from_dict = predictor.predict({'inputs': sample})
        print('predict result: {}'.format(from_dict))
        from_list = predictor.predict(sample)
        print('predict result: {}'.format(from_list))

        # Both request shapes must yield the same prediction.
        assert from_dict == from_list
def test_factorization_machines(sagemaker_session):
    """Train a Factorization Machines regressor on 200 MNIST rows, deploy it and score 10 rows."""
    job_name = unique_name_from_base('fm')

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_pickle = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')
        # Python 3 needs an explicit encoding to load this pickle.
        if sys.version_info.major == 2:
            pickle_args = {}
        else:
            pickle_args = {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(mnist_pickle, 'rb') as pickled:
            train_set, _, _ = pickle.load(pickled, **pickle_args)

        fm = FactorizationMachines(
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            num_factors=10,
            predictor_type='regressor',
            epochs=2,
            clip_gradient=1e2,
            eps=0.001,
            rescale_grad=1.0 / 100,
            sagemaker_session=sagemaker_session,
        )

        # training labels must be 'float32'
        features = train_set[0][:200]
        labels = train_set[1][:200].astype('float32')
        fm.fit(fm.record_set(features, labels), job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        fm_model = FactorizationMachinesModel(fm.model_data,
                                              role='SageMakerRole',
                                              sagemaker_session=sagemaker_session)
        predictor = fm_model.deploy(1, 'ml.c4.xlarge', endpoint_name=job_name)
        predictions = predictor.predict(train_set[0][:10])

        assert len(predictions) == 10
        for prediction in predictions:
            assert prediction.label["score"] is not None
Ejemplo n.º 32
0
def test_async_fit(sagemaker_session):
    """Start a Chainer MNIST job without waiting, re-attach to it by name, then deploy and predict."""
    with timeout(minutes=5):
        job_name = _run_mnist_training_job(sagemaker_session, "ml.c4.xlarge", 1,
                                           chainer_full_version=CHAINER_VERSION, wait=False)

        print("Waiting to re-attach to the training job: %s" % job_name)
        time.sleep(20)

    endpoint_name = unique_name_from_base('test-chainer-async-fit')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        print("Re-attaching now to: %s" % job_name)
        attached = Chainer.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
        endpoint_predictor = attached.deploy(1, "ml.c4.xlarge", endpoint_name=endpoint_name)
        _predict_and_assert(endpoint_predictor)
def mxnet_training_job(sagemaker_session, mxnet_full_version):
    """Train the MXNet MNIST script on one ``ml.c4.xlarge`` instance and return the job name."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        mnist_dir = os.path.join(DATA_DIR, 'mxnet_mnist')

        estimator_kwargs = dict(
            entry_point=os.path.join(DATA_DIR, 'mxnet_mnist', 'mnist.py'),
            role='SageMakerRole',
            framework_version=mxnet_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
        )
        estimator = MXNet(**estimator_kwargs)

        upload = estimator.sagemaker_session.upload_data
        train_s3_uri = upload(path=os.path.join(mnist_dir, 'train'),
                              key_prefix='integ-test-data/mxnet_mnist/train')
        test_s3_uri = upload(path=os.path.join(mnist_dir, 'test'),
                             key_prefix='integ-test-data/mxnet_mnist/test')

        estimator.fit({'train': train_s3_uri, 'test': test_s3_uri})
        return estimator.latest_training_job.name
Ejemplo n.º 34
0
def mxnet_estimator(sagemaker_session, mxnet_full_version, cpu_instance_type):
    """Fit an MXNet MNIST estimator on ``cpu_instance_type`` and return the fitted estimator."""
    estimator = MXNet(
        entry_point=os.path.join(MXNET_MNIST_PATH, "mnist.py"),
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
        framework_version=mxnet_full_version,
    )

    # The uploads happen outside the timeout guard; only the fit is time-boxed.
    upload = estimator.sagemaker_session.upload_data
    channels = {
        "train": upload(path=os.path.join(MXNET_MNIST_PATH, "train"),
                        key_prefix="integ-test-data/mxnet_mnist/train"),
        "test": upload(path=os.path.join(MXNET_MNIST_PATH, "test"),
                       key_prefix="integ-test-data/mxnet_mnist/test"),
    }

    training_job_name = unique_name_from_base("test-mxnet-transform")
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(channels, job_name=training_job_name)

    return estimator
def test_object2vec(sagemaker_session):
    """Train Object2Vec on the local ``train.jsonl`` data, deploy the model and score one instance."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        o2v_dir = os.path.join(DATA_DIR, 'object2vec')
        train_filename = 'train.jsonl'

        # The record count is the number of lines in the training file.
        with open(os.path.join(o2v_dir, train_filename), 'r') as train_file:
            record_count = sum(1 for _ in train_file)

        object2vec = Object2Vec(role='SageMakerRole',
                                train_instance_count=1,
                                train_instance_type='ml.c4.xlarge',
                                epochs=3,
                                enc0_max_seq_len=20,
                                enc0_vocab_size=45000,
                                enc_dim=16,
                                num_classes=3,
                                sagemaker_session=sagemaker_session,
                                base_job_name='test-object2vec')

        training_records = prepare_record_set_from_local_files(
            o2v_dir, object2vec.data_location, record_count, FEATURE_NUM, sagemaker_session)

        object2vec.fit(training_records, None)

    endpoint_name = name_from_base('object2vec')

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        o2v_model = Object2VecModel(object2vec.model_data,
                                    role='SageMakerRole',
                                    sagemaker_session=sagemaker_session)
        predictor = o2v_model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)
        assert isinstance(predictor, RealTimePredictor)

        request = {'instances': [{"in0": [354, 623], "in1": [16]}]}
        predictions = predictor.predict(request)

        assert len(predictions) == 1
        for prediction in predictions:
            assert prediction.label["scores"] is not None
def test_failed_training_job(sagemaker_session, sklearn_full_version):
    """Expect ``fit`` to raise ``ValueError`` when the SKLearn entry point is ``failure_script.py``."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        failing_script = os.path.join(DATA_DIR, "sklearn_mnist", "failure_script.py")
        mnist_dir = os.path.join(DATA_DIR, "sklearn_mnist")

        estimator_kwargs = dict(
            entry_point=failing_script,
            role="SageMakerRole",
            framework_version=sklearn_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type="ml.c4.xlarge",
            sagemaker_session=sagemaker_session,
        )
        estimator = SKLearn(**estimator_kwargs)

        train_s3_uri = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, "train"),
            key_prefix="integ-test-data/sklearn_mnist/train",
        )
        failed_job_name = unique_name_from_base("test-sklearn-failed")

        with pytest.raises(ValueError):
            estimator.fit(train_s3_uri, job_name=failed_job_name)
Ejemplo n.º 37
0
def test_ray_tf(sagemaker_session, rl_ray_full_version):
    """Train the Ray/TensorFlow cartpole script and check that ``deploy`` raises ``NotImplementedError``."""
    estimator_kwargs = dict(
        entry_point='train_ray.py',
        source_dir=os.path.join(DATA_DIR, 'ray_cartpole'),
        toolkit=RLToolkit.RAY,
        framework=RLFramework.TENSORFLOW,
        toolkit_version=rl_ray_full_version,
        sagemaker_session=sagemaker_session,
        role='SageMakerRole',
        train_instance_type=CPU_INSTANCE,
        train_instance_count=1,
    )
    rl_estimator = RLEstimator(**estimator_kwargs)
    training_job_name = unique_name_from_base('test-ray-tf')

    with timeout(minutes=15):
        rl_estimator.fit(job_name=training_job_name)

    with pytest.raises(NotImplementedError) as raised:
        rl_estimator.deploy(1, CPU_INSTANCE)
    expected_message = 'Automatic deployment of Ray models is not currently available'
    assert expected_message in str(raised.value)
Ejemplo n.º 38
0
def tf_training_job(sagemaker_session, tf_full_version):
    """Train the iris DNN classifier with TensorFlow and return the training job name."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator_kwargs = dict(
            entry_point=os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py'),
            role='SageMakerRole',
            framework_version=tf_full_version,
            training_steps=1,
            evaluation_steps=1,
            checkpoint_path='/opt/ml/model',
            hyperparameters={'input_tensor_name': 'inputs'},
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            sagemaker_session=sagemaker_session,
            base_job_name='test-tf',
        )
        tf_estimator = TensorFlow(**estimator_kwargs)

        s3_inputs = sagemaker_session.upload_data(path=DATA_PATH,
                                                  key_prefix='integ-test-data/tf_iris')
        tf_estimator.fit(s3_inputs)

        finished_job_name = tf_estimator.latest_training_job.name
        print('job succeeded: {}'.format(finished_job_name))

        return tf_estimator.latest_training_job.name
Ejemplo n.º 39
0
def test_deploy_best_candidate(sagemaker_session, cpu_instance_type):
    """Deploy the best candidate of the shared AutoML job and check the endpoint is ``InService``."""
    auto_ml_utils.create_auto_ml_job_if_not_exist(sagemaker_session)

    automl = AutoML(role=ROLE,
                    target_attribute_name=TARGET_ATTRIBUTE_NAME,
                    sagemaker_session=sagemaker_session)
    candidate = automl.best_candidate(job_name=AUTO_ML_JOB_NAME)
    endpoint_name = unique_name_from_base("sagemaker-auto-ml-best-candidate-test")

    deploy_kwargs = dict(
        candidate=candidate,
        initial_instance_count=INSTANCE_COUNT,
        instance_type=cpu_instance_type,
        endpoint_name=endpoint_name,
    )
    with timeout(minutes=AUTO_ML_DEFAULT_TIMEMOUT_MINUTES):
        automl.deploy(**deploy_kwargs)

    description = sagemaker_session.sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
    assert description["EndpointStatus"] == "InService"
    # The endpoint is removed only after the status check has passed.
    sagemaker_session.sagemaker_client.delete_endpoint(EndpointName=endpoint_name)
Ejemplo n.º 40
0
def test_pytorch_12_airflow_config_uploads_data_source_to_s3_when_inputs_not_provided(
    sagemaker_session, cpu_instance_type
):
    """Build an Airflow training config for PyTorch 1.2.0 without inputs and check the submit directory in S3."""
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        estimator_kwargs = dict(
            entry_point=PYTORCH_MNIST_SCRIPT,
            role=ROLE,
            framework_version="1.2.0",
            train_instance_count=2,
            train_instance_type=cpu_instance_type,
            hyperparameters={"epochs": 6, "backend": "gloo"},
            sagemaker_session=sagemaker_session,
        )
        pytorch_estimator = PyTorch(**estimator_kwargs)

        airflow_config = _build_airflow_workflow(
            estimator=pytorch_estimator, instance_type=cpu_instance_type
        )

        # The hyperparameter value is wrapped in double quotes; strip them to get the S3 URL.
        submit_directory = airflow_config["HyperParameters"]["sagemaker_submit_directory"]
        _assert_that_s3_url_contains_data(sagemaker_session, submit_directory.strip('"'))
Ejemplo n.º 41
0
def test_ipinsights_serverless_inference(sagemaker_session, cpu_instance_type):
    """Train IP Insights, deploy it with a serverless inference config and check one prediction."""
    job_name = unique_name_from_base("ipinsights-serverless")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        ipinsights_dir = os.path.join(DATA_DIR, "ipinsights")
        train_filename = "train.csv"

        # The record count is the number of lines in the training file.
        with open(os.path.join(ipinsights_dir, train_filename), "rb") as train_file:
            record_count = len(train_file.readlines())

        ipinsights = IPInsights(
            role="SageMakerRole",
            instance_count=1,
            instance_type=cpu_instance_type,
            num_entity_vectors=10,
            vector_dim=100,
            sagemaker_session=sagemaker_session,
        )

        training_records = prepare_record_set_from_local_files(
            ipinsights_dir, ipinsights.data_location, record_count, FEATURE_DIM, sagemaker_session
        )
        ipinsights.fit(records=training_records, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        ipinsights_model = IPInsightsModel(
            ipinsights.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
        )
        serverless_config = ServerlessInferenceConfig(memory_size_in_mb=6144)
        predictor = ipinsights_model.deploy(
            serverless_inference_config=serverless_config,
            endpoint_name=job_name,
        )
        assert isinstance(predictor, Predictor)

        response = predictor.predict([["user_1", "1.1.1.1"]])
        predictions = response["predictions"]

        assert len(predictions) == 1
        assert -1 < predictions[0]["dot_product"] < 0  # We expect ~ -0.22
def test_chainer_airflow_config_uploads_data_source_to_s3(
        sagemaker_local_session, cpu_instance_type, chainer_latest_version,
        chainer_latest_py_version):
    """Build an Airflow training config for a local-mode Chainer estimator and check the submit directory in S3."""
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        mnist_script = os.path.join(DATA_DIR, "chainer_mnist", "mnist.py")
        mnist_dir = os.path.join(DATA_DIR, "chainer_mnist")

        estimator_kwargs = dict(
            entry_point=mnist_script,
            role=ROLE,
            instance_count=SINGLE_INSTANCE_COUNT,
            instance_type="local",
            framework_version=chainer_latest_version,
            py_version=chainer_latest_py_version,
            sagemaker_session=sagemaker_local_session,
            hyperparameters={"epochs": 1},
            use_mpi=True,
            num_processes=2,
            process_slots_per_host=2,
            additional_mpi_options="-x NCCL_DEBUG=INFO",
        )
        chainer_estimator = Chainer(**estimator_kwargs)

        # Inputs are local file:// URIs rather than S3 locations.
        channels = {
            "train": "file://" + os.path.join(mnist_dir, "train"),
            "test": "file://" + os.path.join(mnist_dir, "test"),
        }

        airflow_config = _build_airflow_workflow(
            estimator=chainer_estimator,
            instance_type=cpu_instance_type,
            inputs=channels,
        )

        # The hyperparameter value is wrapped in double quotes; strip them to get the S3 URL.
        submit_directory = airflow_config["HyperParameters"]["sagemaker_submit_directory"]
        _assert_that_s3_url_contains_data(sagemaker_local_session, submit_directory.strip('"'))
Ejemplo n.º 43
0
def test_attach_tuning_pytorch(sagemaker_session):
    """Run a PyTorch MNIST tuning job, re-attach to it by name, deploy from the attached tuner and predict."""
    mnist_dir = os.path.join(DATA_DIR, 'pytorch_mnist')

    estimator = PyTorch(
        entry_point=os.path.join(mnist_dir, 'mnist.py'),
        role='SageMakerRole',
        train_instance_count=1,
        py_version=PYTHON_VERSION,
        train_instance_type='ml.c4.xlarge',
        sagemaker_session=sagemaker_session,
    )

    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        metric_name = 'evaluation-accuracy'
        metrics = [{'Name': 'evaluation-accuracy', 'Regex': r'Overall test accuracy: (\d+)'}]
        search_space = {'batch-size': IntegerParameter(50, 100)}

        tuner = HyperparameterTuner(estimator, metric_name, search_space, metrics,
                                    max_jobs=2, max_parallel_jobs=2)

        training_s3_uri = estimator.sagemaker_session.upload_data(
            path=os.path.join(mnist_dir, 'training'),
            key_prefix='integ-test-data/pytorch_mnist/training')
        tuner.fit({'training': training_s3_uri})

        tuning_job_name = tuner.latest_tuning_job.name

        print('Started hyperparameter tuning job with name:' + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    attached_tuner = HyperparameterTuner.attach(tuning_job_name, sagemaker_session=sagemaker_session)
    # The best job name comes from the original tuner; the deployment uses the attached one.
    best_job_name = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_job_name, sagemaker_session):
        predictor = attached_tuner.deploy(1, 'ml.c4.xlarge')
        single_image = np.zeros(shape=(1, 1, 28, 28), dtype=np.float32)
        predictor.predict(single_image)

        batch_size = 100
        image_batch = np.random.rand(batch_size, 1, 28, 28).astype(np.float32)
        scores = predictor.predict(image_batch)

        assert scores.shape == (batch_size, 10)
def _run_mnist_training_job(sagemaker_session,
                            instance_type,
                            instance_count,
                            chainer_full_version,
                            wait=True):
    """Launch a Chainer MNIST training job and return its name.

    Args:
        sagemaker_session: session used for data upload and training.
        instance_type: training instance type, e.g. ``"ml.c4.xlarge"``.
        instance_count: number of training instances. ``mnist.py`` is used
            for a single instance, ``distributed_mnist.py`` otherwise.
        chainer_full_version: Chainer framework version for the estimator.
        wait: forwarded to ``fit``; ``False`` returns without blocking on the job.

    Returns:
        The name of the training job that was started.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):

        # Select on the instance COUNT. The previous check compared the instance
        # type string with 1, which was always False, so the distributed script
        # was used even for single-instance jobs.
        script_name = "mnist.py" if instance_count == 1 else "distributed_mnist.py"
        script_path = os.path.join(DATA_DIR, "chainer_mnist", script_name)

        data_path = os.path.join(DATA_DIR, "chainer_mnist")

        chainer = Chainer(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version=chainer_full_version,
            py_version=PYTHON_VERSION,
            train_instance_count=instance_count,
            train_instance_type=instance_type,
            sagemaker_session=sagemaker_session,
            hyperparameters={"epochs": 1},
        )

        train_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, "train"),
            key_prefix="integ-test-data/chainer_mnist/train")
        test_input = chainer.sagemaker_session.upload_data(
            path=os.path.join(data_path, "test"),
            key_prefix="integ-test-data/chainer_mnist/test")

        job_name = unique_name_from_base("test-chainer-training")
        chainer.fit({"train": train_input, "test": test_input},
                    wait=wait,
                    job_name=job_name)
        return chainer.latest_training_job.name
def test_shap(clarify_processor, data_config, model_config, shap_config, sagemaker_session):
    """Run a Clarify explainability job and check the global SHAP value reported for feature ``F2``."""
    with timeout.timeout(minutes=CLARIFY_DEFAULT_TIMEOUT_MINUTES):
        explainability_job_name = utils.unique_name_from_base("clarify-explainability")
        clarify_processor.run_explainability(
            data_config,
            model_config,
            shap_config,
            model_scores="score",
            job_name=explainability_job_name,
            wait=True,
        )

        # The job's result is read back from analysis.json under the configured output path.
        raw_analysis = s3.S3Downloader.read_file(
            data_config.s3_output_path + "/analysis.json",
            sagemaker_session,
        )
        analysis = json.loads(raw_analysis)

        kernel_shap = analysis["explanations"]["kernel_shap"]
        f2_shap_value = kernel_shap["label0"]["global_shap_values"]["F2"]
        assert math.fabs(f2_shap_value) <= 1

        check_analysis_config(data_config, sagemaker_session, "shap")
Ejemplo n.º 46
0
def test_coach_mxnet(sagemaker_session, rl_coach_full_version):
    """Train a Coach/MXNet RL estimator asynchronously, re-attach, deploy and check the action.

    Args:
        sagemaker_session: session used for training, attach and deployment.
        rl_coach_full_version: Coach toolkit version passed to ``_test_coach``.
    """
    estimator = _test_coach(sagemaker_session, RLFramework.MXNET,
                            rl_coach_full_version)

    with timeout(minutes=15):
        # Pass a real boolean: the string 'False' is truthy, so the previous
        # call blocked until training finished instead of returning at once.
        estimator.fit(wait=False)

        # Attach with the same session used everywhere else in this test so the
        # deployment and the endpoint cleanup below use one session.
        # NOTE(review): attach is documented to block while the job is in
        # progress, which keeps the wait inside this timeout - confirm for the
        # SDK version in use.
        estimator = RLEstimator.attach(estimator.latest_training_job.name,
                                       sagemaker_session=sagemaker_session)

    endpoint_name = 'test-mxnet-coach-deploy-{}'.format(sagemaker_timestamp())

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(1,
                                     CPU_INSTANCE,
                                     entry_point='mxnet_deploy.py',
                                     endpoint_name=endpoint_name)

        observation = numpy.asarray([0, 0, 0, 0])
        action = predictor.predict(observation)

    # Both entries of the first returned row must lie strictly between 0 and 1.
    assert 0 < action[0][0] < 1
    assert 0 < action[0][1] < 1
def test_pre_training_bias(clarify_processor, data_config, data_bias_config, sagemaker_session):
    """Run a pre-training bias job and sanity-check the metrics it reports.

    ``analysis.json`` is read from under ``data_config.s3_output_path``; the
    absolute value of the first metric of the first ``F1`` facet entry must be
    at most 1.0. Finally ``check_analysis_config`` is run for the
    ``"pre_training_bias"`` analysis.
    """
    with timeout.timeout(minutes=CLARIFY_DEFAULT_TIMEOUT_MINUTES):
        bias_job = utils.unique_name_from_base("clarify-pretraining-bias")
        clarify_processor.run_pre_training_bias(
            data_config,
            data_bias_config,
            job_name=bias_job,
            wait=True,
        )

        # Fetch and parse the analysis report from the configured output path.
        report_uri = data_config.s3_output_path + "/analysis.json"
        report = json.loads(s3.S3Downloader.read_file(report_uri, sagemaker_session))

        f1_facets = report["pre_training_bias_metrics"]["facets"]["F1"]
        first_metric_value = f1_facets[0]["metrics"][0]["value"]
        assert math.fabs(first_metric_value) <= 1.0

        check_analysis_config(data_config, sagemaker_session, "pre_training_bias")
def test_mnist_distributed(sagemaker_session, instance_type):
    """Train script-mode TensorFlow MNIST on two instances and check the model artifacts.

    The training data under ``RESOURCE_PATH/data`` is uploaded to S3, the job
    is fitted under a unique name, and ``_assert_s3_files_exist`` then verifies
    that the graph and checkpoint files are present in the model directory.
    ``instance_type`` is accepted but not used yet (see the TODO below).
    """
    expected_model_files = ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta']

    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=ROLE,
        train_instance_count=2,
        # TODO: change train_instance_type to instance_type once the test is passing consistently
        train_instance_type='ml.c5.xlarge',
        sagemaker_session=sagemaker_session,
        py_version=integ.PYTHON_VERSION,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        distributions=PARAMETER_SERVER_DISTRIBUTION)

    local_data_dir = os.path.join(RESOURCE_PATH, 'data')
    inputs = estimator.sagemaker_session.upload_data(
        path=local_data_dir,
        key_prefix='scriptmode/distributed_mnist')

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        training_job = unique_name_from_base('test-tf-sm-distributed')
        estimator.fit(inputs=inputs, job_name=training_job)

    _assert_s3_files_exist(estimator.model_dir, expected_model_files)
# Example no. 49
# 0
def _create_and_fit_estimator(sagemaker_session, instance_type, tmpdir):
    """Fit a two-instance MPI-enabled TensorFlow job and verify each rank's output.

    After training, the model archive is extracted into ``tmpdir`` and the
    ``rank-<n>`` JSON file of each of the two ranks must record its own rank.
    """
    entry_script = os.path.join(horovod_dir, "hvd_basic.py")
    mpi_enabled = {"mpi": {"enabled": True}}

    job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
    estimator = TensorFlow(
        entry_point=entry_script,
        role="SageMakerRole",
        train_instance_count=2,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        py_version=integ.PYTHON_VERSION,
        script_mode=True,
        framework_version="1.12",
        distributions=mpi_enabled,
    )

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(job_name=job_name)

        extract_dir = str(tmpdir)
        extract_files_from_s3(estimator.model_data, extract_dir, sagemaker_session)

        # One file per rank is expected, for ranks 0 and 1.
        for expected_rank in range(2):
            rank_report = read_json("rank-%s" % expected_rank, extract_dir)
            assert rank_report["rank"] == expected_rank
def test_xgboost_airflow_config_uploads_data_source_to_s3(
        sagemaker_session, cpu_instance_type):
    """Build an Airflow training config for XGBoost and check the submitted source is in S3.

    The ``sagemaker_submit_directory`` hyperparameter of the generated config
    (with surrounding double quotes stripped) is handed to
    ``_assert_that_s3_url_contains_data``.
    """
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        entry_script = os.path.join(DATA_DIR, "dummy_script.py")
        xgboost = XGBoost(
            entry_point=entry_script,
            framework_version=XGBOOST_LATEST_VERSION,
            role=ROLE,
            sagemaker_session=sagemaker_session,
            train_instance_type=cpu_instance_type,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            base_job_name="XGBoost job",
            py_version=PYTHON_VERSION,
        )

        training_config = _build_airflow_workflow(
            estimator=xgboost, instance_type=cpu_instance_type)

        # The hyperparameter value is wrapped in double quotes; drop them.
        hyperparameters = training_config["HyperParameters"]
        submit_dir_url = hyperparameters["sagemaker_submit_directory"].strip('"')

        _assert_that_s3_url_contains_data(sagemaker_session, submit_dir_url)
# Example no. 51
# 0
def test_async_pca(sagemaker_session):
    """Start a PCA training job without waiting, re-attach to it, deploy and predict.

    The first 100 training samples are used for fitting. After a 20 second
    pause the job is re-attached by name, its model is deployed, and the first
    five samples are sent for prediction; each of the five returned records
    must carry a ``projection`` label.
    """
    training_job_name = ""
    endpoint_name = unique_name_from_base('pca')

    with timeout(minutes=5):
        data_path = os.path.join(DATA_DIR, 'one_p_mnist', 'mnist.pkl.gz')

        # Python 3 needs an explicit encoding to read this pickle.
        if sys.version_info.major == 2:
            pickle_args = {}
        else:
            pickle_args = {'encoding': 'latin1'}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, 'rb') as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        pca = sagemaker.amazon.pca.PCA(
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type='ml.m4.xlarge',
            num_components=48,
            sagemaker_session=sagemaker_session,
            base_job_name='test-pca')

        pca.algorithm_mode = 'randomized'
        pca.subtract_mean = True
        pca.extra_components = 5

        training_records = pca.record_set(train_set[0][:100])
        pca.fit(training_records, wait=False)
        training_job_name = pca.latest_training_job.name

        print("Detached from training job. Will re-attach in 20 seconds")
        time.sleep(20)

    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = sagemaker.amazon.pca.PCA.attach(
            training_job_name=training_job_name,
            sagemaker_session=sagemaker_session)

        model = sagemaker.amazon.pca.PCAModel(
            estimator.model_data,
            role='SageMakerRole',
            sagemaker_session=sagemaker_session)
        predictor = model.deploy(
            initial_instance_count=1,
            instance_type="ml.c4.xlarge",
            endpoint_name=endpoint_name)

        result = predictor.predict(train_set[0][:5])

        assert len(result) == 5
        for projected in result:
            assert projected.label["projection"] is not None
# Example no. 52
# 0
def _create_and_fit_estimator(mxnet_version, py_version, sagemaker_session, instance_type, tmpdir):
    """Fit a two-instance MPI-enabled MXNet job and verify each rank's output.

    After training, the model archive is extracted into ``tmpdir`` and the
    ``rank-<n>`` JSON file of each of the two ranks must record its own rank.
    """
    entry_script = os.path.join(horovod_dir, "hvd_mnist_mxnet.py")
    mpi_enabled = {"mpi": {"enabled": True}}

    job_name = sagemaker.utils.unique_name_from_base("mx-horovod")
    estimator = MXNet(
        entry_point=entry_script,
        role="SageMakerRole",
        instance_count=2,
        instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        py_version=py_version,
        framework_version=mxnet_version,
        distribution=mpi_enabled,
        disable_profiler=True,
    )

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(job_name=job_name)

        extract_dir = str(tmpdir)
        extract_files_from_s3(estimator.model_data, extract_dir, sagemaker_session)

        # One file per rank is expected, for ranks 0 and 1.
        for expected_rank in range(2):
            rank_report = read_json("rank-%s" % expected_rank, extract_dir)
            assert rank_report["rank"] == expected_rank
# Example no. 53
# 0
def test_ntm(sagemaker_session):
    """Train an NTM model on local record data, deploy it and run one prediction.

    The feature count is taken from the first record of the training file.
    A single random input row of that width is sent to the endpoint, and the
    one returned record must carry a ``topic_weights`` label.
    """
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, 'ntm')
        data_filename = 'nips-train_1.pbr'
        training_file = os.path.join(data_path, data_filename)

        with open(training_file, 'rb') as f:
            all_records = read_records(f)

        # all records must be same
        first_record_values = all_records[0].features['values']
        feature_num = int(first_record_values.float32_tensor.shape[0])

        ntm = NTM(
            role='SageMakerRole',
            train_instance_count=1,
            train_instance_type='ml.c4.xlarge',
            num_topics=10,
            sagemaker_session=sagemaker_session,
            base_job_name='test-ntm')

        record_count = len(all_records)
        record_set = prepare_record_set_from_local_files(
            data_path, ntm.data_location, record_count, feature_num,
            sagemaker_session)
        ntm.fit(record_set, None)

    endpoint_name = name_from_base('ntm')
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        model = NTMModel(
            ntm.model_data,
            role='SageMakerRole',
            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, 'ml.c4.xlarge', endpoint_name=endpoint_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for topic_record in result:
            assert topic_record.label["topic_weights"] is not None
def test_endpoint_config_step(trained_estimator, sfn_client, sagemaker_session,
                              sfn_role_arn):
    """Execute a one-step workflow that creates an endpoint config and check its output.

    A SageMaker model is created from ``trained_estimator``, a workflow holding
    a single ``EndpointConfigStep`` is created and executed, and the execution
    output must contain an ``EndpointConfigArn`` and an HTTP status code of 200.

    Cleanup runs in a ``finally`` clause so that the state machine, the
    endpoint config and the model are removed even when the workflow or one of
    the assertions fails.
    """
    # Setup: Create model for trained estimator in SageMaker
    model = trained_estimator.create_model()
    model._create_sagemaker_model(instance_type=INSTANCE_TYPE)
    # End of Setup

    # Build workflow definition
    endpoint_config_name = unique_name_from_base("integ-test-endpoint-config")
    endpoint_config_step = EndpointConfigStep(
        'create_endpoint_config_step',
        endpoint_config_name=endpoint_config_name,
        model_name=model.name,
        initial_instance_count=INSTANCE_COUNT,
        instance_type=INSTANCE_TYPE)
    workflow_graph = Chain([endpoint_config_step])

    # Track what was actually created so cleanup only touches existing
    # resources and cannot mask the original failure with a deletion error.
    workflow = None
    endpoint_config_created = False

    with timeout(minutes=DEFAULT_TIMEOUT_MINUTES):
        try:
            # Create workflow and check definition
            workflow = create_workflow_and_check_definition(
                workflow_graph=workflow_graph,
                workflow_name=unique_name_from_base(
                    "integ-test-create-endpoint-config-step-workflow"),
                sfn_client=sfn_client,
                sfn_role_arn=sfn_role_arn)

            # Execute workflow
            execution = workflow.execute()
            execution_output = execution.get_output(wait=True)

            # Check workflow output
            assert execution_output.get("EndpointConfigArn") is not None
            endpoint_config_created = True
            assert execution_output["SdkHttpMetadata"]["HttpStatusCode"] == 200
        finally:
            # Cleanup (same order as on the success path)
            if workflow is not None:
                state_machine_delete_wait(sfn_client, workflow.state_machine_arn)
            if endpoint_config_created:
                delete_sagemaker_endpoint_config(endpoint_config_name,
                                                 sagemaker_session)
            delete_sagemaker_model(model.name, sagemaker_session)
# Example no. 55
# 0
def test_marketplace_estimator(sagemaker_session, cpu_instance_type):
    """Train a marketplace algorithm, deploy it and print predictions for sample rows.

    The algorithm ARN is built from the session's region and the matching
    account in ``REGION_ACCOUNT_MAP``. After deployment, rows 40-49, 90-99 and
    100+40..100+49 of ``iris.csv`` (minus the very last selected row) are sent
    to the endpoint, with the first column dropped.
    """
    with timeout(minutes=15):
        data_path = os.path.join(DATA_DIR, "marketplace", "training")
        region = sagemaker_session.boto_region_name
        account = REGION_ACCOUNT_MAP[region]
        algorithm_arn = ALGORITHM_ARN % (region, account)

        algo = AlgorithmEstimator(
            algorithm_arn=algorithm_arn,
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            sagemaker_session=sagemaker_session,
        )

        train_input = algo.sagemaker_session.upload_data(
            path=data_path, key_prefix="integ-test-data/marketplace/train")

        algo.fit({"training": train_input})

    endpoint_name = "test-marketplace-estimator{}".format(
        sagemaker_timestamp())
    with timeout_and_delete_endpoint_by_name(endpoint_name,
                                             sagemaker_session,
                                             minutes=20):
        predictor = algo.deploy(1,
                                cpu_instance_type,
                                endpoint_name=endpoint_name)
        iris_frame = pandas.read_csv(os.path.join(data_path, "iris.csv"),
                                     header=None)

        # Ten consecutive rows starting at offset 40 inside each block of 50.
        block_starts = list(range(0, 150, 50))
        rows_in_block = list(range(40, 50))
        indices = [start + row for start in block_starts for row in rows_in_block]

        selected_rows = iris_frame.iloc[indices[:-1]]
        feature_columns = selected_rows.iloc[:, 1:]

        raw_prediction = predictor.predict(feature_columns.values)
        print(raw_prediction.decode("utf-8"))
def test_huggingface_training(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_latest_version,
    huggingface_pytorch_version,
):
    """Run a HuggingFace GLUE (wnli) training job from the transformers git repository.

    The entry point is taken from the ``v<transformers version>`` branch of the
    transformers repository, and the local ``huggingface/train`` data directory
    is uploaded to S3 as the training input.
    """
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "huggingface")

        glue_hyperparameters = {
            "model_name_or_path": "distilbert-base-cased",
            "task_name": "wnli",
            "do_train": True,
            "do_eval": True,
            "max_seq_length": 128,
            "fp16": True,
            "per_device_train_batch_size": 128,
            "output_dir": "/opt/ml/model",
        }
        transformers_repo = {
            "repo": "https://github.com/huggingface/transformers.git",
            "branch": f"v{huggingface_training_latest_version}",
        }

        hf = HuggingFace(
            py_version="py36",
            entry_point="examples/text-classification/run_glue.py",
            role="SageMakerRole",
            transformers_version=huggingface_training_latest_version,
            pytorch_version=huggingface_pytorch_version,
            instance_count=1,
            instance_type=gpu_instance_type,
            hyperparameters=glue_hyperparameters,
            sagemaker_session=sagemaker_session,
            git_config=transformers_repo,
        )

        local_train_dir = os.path.join(data_path, "train")
        train_input = hf.sagemaker_session.upload_data(
            path=local_train_dir, key_prefix="integ-test-data/huggingface/train"
        )

        hf.fit(train_input)
# Example no. 57
# 0
def test_smdataparallel_tf_mnist(
    sagemaker_session,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    """Fit TensorFlow MNIST on two ``ml.p3.16xlarge`` instances with smdistributed dataparallel.

    The test only requires that ``fit`` completes within the training timeout.
    """
    dataparallel_options = {"enabled": True, "custom_mpi_options": "--verbose"}
    distribution = {"smdistributed": {"dataparallel": dataparallel_options}}

    job_name = sagemaker.utils.unique_name_from_base("tf-sm-distributed-dataparallel")
    estimator = TensorFlow(
        entry_point="mnist_tf.py",
        role="SageMakerRole",
        source_dir=smdataparallel_dir,
        instance_count=2,
        instance_type="ml.p3.16xlarge",
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        distribution=distribution,
    )

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(job_name=job_name)
# Example no. 58
# 0
def test_kmeans_airflow_config_uploads_data_source_to_s3(
        sagemaker_session, cpu_instance_type):
    """Build an Airflow training config for KMeans and check the input data is in S3.

    A record set is created from the first 100 training samples, and the S3
    URI of the first input channel in the generated config is handed to
    ``_assert_that_s3_url_contains_data``.
    """
    with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")

        # Python 3 needs an explicit encoding to read this pickle.
        if sys.version_info.major == 2:
            pickle_args = {}
        else:
            pickle_args = {"encoding": "latin1"}

        # Load the data into memory as numpy arrays
        with gzip.open(data_path, "rb") as f:
            train_set, _, _ = pickle.load(f, **pickle_args)

        kmeans = KMeans(
            role=ROLE,
            train_instance_count=SINGLE_INSTANCE_COUNT,
            train_instance_type=cpu_instance_type,
            k=10,
            sagemaker_session=sagemaker_session,
        )

        # Applied in this exact order, one attribute at a time.
        kmeans_settings = (
            ("init_method", "random"),
            ("max_iterations", 1),
            ("tol", 1),
            ("num_trials", 1),
            ("local_init_method", "kmeans++"),
            ("half_life_time_size", 1),
            ("epochs", 1),
            ("center_factor", 1),
            ("eval_metrics", ["ssd", "msd"]),
        )
        for setting_name, setting_value in kmeans_settings:
            setattr(kmeans, setting_name, setting_value)

        records = kmeans.record_set(train_set[0][:100])

        training_config = _build_airflow_workflow(kmeans, records)

        first_channel = training_config["InputDataConfig"][0]
        input_s3_uri = first_channel["DataSource"]["S3DataSource"]["S3Uri"]
        _assert_that_s3_url_contains_data(sagemaker_session, input_s3_uri)
# Example no. 59
# 0
def test_tuning_mxnet(sagemaker_session, mxnet_full_version):
    """Tune the MXNet MNIST learning rate, then deploy via the tuner and predict.

    Four training jobs (two in parallel) search ``learning-rate`` in
    [0.01, 0.2] against the ``Validation-accuracy`` metric. Afterwards the
    tuner is deployed and a single all-zero input of shape (1, 1, 28, 28) is
    sent for prediction; the endpoint cleanup is keyed on the best training
    job's name.
    """
    with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, 'mxnet_mnist')
        script_path = os.path.join(data_path, 'mnist.py')

        estimator = MXNet(
            entry_point=script_path,
            role='SageMakerRole',
            py_version=PYTHON_VERSION,
            train_instance_count=1,
            train_instance_type='ml.m4.xlarge',
            framework_version=mxnet_full_version,
            sagemaker_session=sagemaker_session)

        objective_metric_name = 'Validation-accuracy'
        hyperparameter_ranges = {'learning-rate': ContinuousParameter(0.01, 0.2)}
        accuracy_metric = {
            'Name': 'Validation-accuracy',
            'Regex': 'Validation-accuracy=([0-9\\.]+)',
        }
        metric_definitions = [accuracy_metric]
        tuner = HyperparameterTuner(
            estimator,
            objective_metric_name,
            hyperparameter_ranges,
            metric_definitions,
            max_jobs=4,
            max_parallel_jobs=2)

        session = estimator.sagemaker_session
        train_input = session.upload_data(
            path=os.path.join(data_path, 'train'),
            key_prefix='integ-test-data/mxnet_mnist/train')
        test_input = session.upload_data(
            path=os.path.join(data_path, 'test'),
            key_prefix='integ-test-data/mxnet_mnist/test')

        tuning_job_name = unique_name_from_base('tune-mxnet', max_length=32)
        channels = {'train': train_input, 'test': test_input}
        tuner.fit(channels, job_name=tuning_job_name)

        print('Started hyperparameter tuning job with name:' + tuning_job_name)

        time.sleep(15)
        tuner.wait()

    best_training_job = tuner.best_training_job()
    with timeout_and_delete_endpoint_by_name(best_training_job, sagemaker_session):
        predictor = tuner.deploy(1, 'ml.c4.xlarge')
        zero_image = np.zeros(shape=(1, 1, 28, 28))
        predictor.predict(zero_image)
# Example no. 60
# 0
def test_ntm(sagemaker_session, cpu_instance_type):
    """Train an NTM model on local record data, deploy it and run one prediction.

    The same unique name is used for the training job and the endpoint. The
    feature count is taken from the first record of the training file; a
    single random input row of that width is sent to the endpoint, and the one
    returned record must carry a ``topic_weights`` label.
    """
    job_name = unique_name_from_base("ntm")

    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        data_path = os.path.join(DATA_DIR, "ntm")
        data_filename = "nips-train_1.pbr"
        training_file = os.path.join(data_path, data_filename)

        with open(training_file, "rb") as f:
            all_records = read_records(f)

        # all records must be same
        first_record_values = all_records[0].features["values"]
        feature_num = int(first_record_values.float32_tensor.shape[0])

        ntm = NTM(
            role="SageMakerRole",
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            num_topics=10,
            sagemaker_session=sagemaker_session,
        )

        record_count = len(all_records)
        record_set = prepare_record_set_from_local_files(
            data_path, ntm.data_location, record_count, feature_num,
            sagemaker_session)
        ntm.fit(records=record_set, job_name=job_name)

    with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
        model = NTMModel(
            ntm.model_data,
            role="SageMakerRole",
            sagemaker_session=sagemaker_session)
        predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)

        predict_input = np.random.rand(1, feature_num)
        result = predictor.predict(predict_input)

        assert len(result) == 1
        for topic_record in result:
            assert topic_record.label["topic_weights"] is not None