Example #1
def test_monitored_session(script_mode):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    json_file_contents = """
            {
                "S3OutputPath": "s3://sagemaker-test",
                "LocalPath": "/opt/ml/output/tensors",
                "HookParameters" : {
                    "save_interval": "100"
                }
            }
            """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.global_variables_initializer()
        mnist = get_data()

        if script_mode:
            hook = smd.SessionHook(out_dir=sim.out_dir)
            sess = tf.train.MonitoredSession(hooks=[hook])
        else:
            sess = tf.train.MonitoredSession()

        with sess:
            sess.run(init)
            for step in range(1, 101):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
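Note: every example in this section enters a SagemakerSimulator context manager from the smdebug test utilities. The tests only rely on it writing the hook's JSON config where smdebug will look for it and exposing out_dir and tensorboard_dir. Below is a minimal sketch of that contract; the config path and environment-variable name are assumptions, not the real implementation:

import os
import shutil

class SagemakerSimulatorSketch:
    """ Hypothetical stand-in for the SagemakerSimulator used in these tests. """

    def __init__(self, json_file_contents="{}"):
        self.json_file_contents = json_file_contents
        self.out_dir = "/opt/ml/output/tensors"              # assumed default
        self.tensorboard_dir = "/opt/ml/output/tensorboard"  # assumed default
        self.config_path = "/tmp/debughookconfig.json"       # assumed location

    def __enter__(self):
        shutil.rmtree(self.out_dir, ignore_errors=True)
        with open(self.config_path, "w") as f:
            f.write(self.json_file_contents)
        # Assumed discovery mechanism: smdebug reads the config path from an env var.
        os.environ["SMDEBUG_CONFIG_FILE_PATH"] = self.config_path
        return self

    def __exit__(self, *exc_info):
        os.environ.pop("SMDEBUG_CONFIG_FILE_PATH", None)
        return False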
Example #2
def test_keras_gradients(script_mode, tf_optimizer):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    tf.keras.backend.clear_session()
    json_file_contents = """
            {
                "S3OutputPath": "s3://sagemaker-test",
                "LocalPath": "/opt/ml/output/tensors",
                "CollectionConfigurations": [
                    {
                        "CollectionName": "gradients"
                    },
                    {
                        "CollectionName": "optimizer_variables"
                    },
                    {
                        "CollectionName": "losses"
                    }
                ]
            }
            """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = get_keras_model_v1()
        (x_train, y_train), (x_test, y_test) = get_keras_data()

        if tf_optimizer:
            opt = tf.train.RMSPropOptimizer(0.1)
        else:
            opt = tf.keras.optimizers.RMSprop()

        if script_mode:
            hook = smd.KerasHook(
                out_dir=sim.out_dir,
                include_collections=["gradients", "optimizer_variables", "losses"],
            )
            opt = hook.wrap_optimizer(opt)
            model.compile(
                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(
                x_train, y_train, batch_size=16, epochs=5, validation_split=0.2, callbacks=[hook]
            )
            test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
        else:
            model.compile(
                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(x_train, y_train, batch_size=16, epochs=5, validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="gradients")) > 0
        if not tf_optimizer:
            # as this is only supported for keras optimizers currently
            assert len(trial.tensor_names(collection="optimizer_variables")) > 0
Example #3
def test_linear_classifier(script_mode: bool):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        # Setup
        train_input_fn, eval_input_fn = get_input_fns()
        x_feature = tf.feature_column.numeric_column("x", shape=(28, 28))
        estimator = tf.compat.v1.estimator.LinearClassifier(
            feature_columns=[x_feature],
            model_dir="/tmp/mnist_linear_classifier",
            n_classes=10)

        # Train
        if script_mode:
            hook = smd.EstimatorHook(out_dir=sim.out_dir)
            estimator.train(input_fn=train_input_fn, steps=100, hooks=[hook])
        else:
            estimator.train(input_fn=train_input_fn, steps=100)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
Example #4
def test_keras_v1(script_mode):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    tf.keras.backend.clear_session()
    with SagemakerSimulator() as sim:
        model = get_keras_model_v1()
        (x_train, y_train), (x_test, y_test) = get_keras_data()

        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer=tf.keras.optimizers.RMSprop(),
            metrics=["accuracy"],
        )
        if script_mode:
            hook = smd.KerasHook(out_dir=sim.out_dir)
            history = model.fit(
                x_train, y_train, batch_size=64, epochs=5, validation_split=0.2, callbacks=[hook]
            )
            test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
        else:
            history = model.fit(x_train, y_train, batch_size=64, epochs=5, validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
Example #5
def test_estimator(script_mode: bool):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        # Setup
        mnist_classifier = get_estimator()
        train_input_fn, eval_input_fn = get_input_fns()

        # Train and evaluate
        train_steps, eval_steps = 80, 20
        if script_mode:
            hook = smd.EstimatorHook(out_dir=sim.out_dir)
            hook.set_mode(mode=smd.modes.TRAIN)
            mnist_classifier.train(input_fn=train_input_fn,
                                   steps=train_steps,
                                   hooks=[hook])
            hook.set_mode(mode=smd.modes.EVAL)
            mnist_classifier.evaluate(input_fn=eval_input_fn,
                                      steps=eval_steps,
                                      hooks=[hook])
        else:
            mnist_classifier.train(input_fn=train_input_fn, steps=train_steps)
            mnist_classifier.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        print(trial)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert trial.steps() == [0, train_steps], "Wrong step count for trial."
Example #6
def test_tensorboard_dir_sagemaker():
    """ In Sagemaker, we read the tensorboard_dir from a separate JSON config file. """
    with SagemakerSimulator() as sim:
        smd.del_hook()
        hook = smd.get_hook(create_if_not_exists=True)
        assert hook.out_dir == sim.out_dir
        assert hook.tensorboard_dir == sim.tensorboard_dir
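The "separate JSON config file" the docstring refers to is distinct from the hook config shown elsewhere in this section. A plausible shape for it, with the key name assumed by analogy with the hook config:

# Hypothetical contents of the TensorBoard output config that the simulator
# is assumed to write alongside the hook config.
tensorboard_json_file_contents = """
{
    "LocalPath": "/opt/ml/output/tensorboard"
}
"""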
Example #7
def test_monitored_session(script_mode: bool):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.compat.v1.global_variables_initializer()
        mnist = get_data()

        if script_mode:
            hook = smd.SessionHook(out_dir=sim.out_dir)
            sess = tf.train.MonitoredSession(hooks=[hook])
        else:
            sess = tf.train.MonitoredSession()

        with sess:
            sess.run(init)
            for step in range(1, 101):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
Example #8
def helper_test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode:
        tf.compat.v1.disable_eager_execution()
    with SagemakerSimulator() as sim:
        model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255

        opt = tf.keras.optimizers.RMSprop()
        if script_mode:
            hook = smd.KerasHook(out_dir=sim.out_dir, export_tensorboard=True)
            opt = hook.wrap_optimizer(opt)
            model.compile(loss="sparse_categorical_crossentropy",
                          optimizer=opt,
                          metrics=["accuracy"])
            history = model.fit(x_train,
                                y_train,
                                batch_size=64,
                                epochs=2,
                                validation_split=0.2,
                                callbacks=[hook])
            test_scores = model.evaluate(x_test,
                                         y_test,
                                         verbose=2,
                                         callbacks=[hook])
        else:
            model.compile(loss="sparse_categorical_crossentropy",
                          optimizer=opt,
                          metrics=["accuracy"])
            history = model.fit(x_train,
                                y_train,
                                batch_size=64,
                                epochs=2,
                                validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        hook = smd.get_hook()
        assert hook
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."

        # DEFAULT TENSORS SAVED
        assert len(trial.tensor_names(
            collection=CollectionKeys.LOSSES)) > 0, "No Losses Saved"
        assert len(trial.tensor_names(
            collection=CollectionKeys.METRICS)) > 0, "No Metrics Saved"
        assert (len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 0
                ), "Weights were not expected to be saved by default"
        assert (len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 0
                ), "Biases were not expected to be saved by default"
Example #9
def helper_test_keras_v2_json_config(json_file_contents,
                                     script_mode: bool = False,
                                     eager_mode: bool = True):
    """ Tests ZCC with custom hook configs """
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode:
        tf.compat.v1.disable_eager_execution()
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255

        opt = tf.keras.optimizers.RMSprop()
        if script_mode:
            hook = smd.KerasHook.create_from_json_file()
            opt = hook.wrap_optimizer(opt)
            model.compile(loss="sparse_categorical_crossentropy",
                          optimizer=opt,
                          metrics=["accuracy"])
            history = model.fit(x_train,
                                y_train,
                                batch_size=64,
                                epochs=2,
                                validation_split=0.2,
                                callbacks=[hook])
            test_scores = model.evaluate(x_test,
                                         y_test,
                                         verbose=2,
                                         callbacks=[hook])
        else:
            model.compile(loss="sparse_categorical_crossentropy",
                          optimizer=opt,
                          metrics=["accuracy"])
            history = model.fit(x_train,
                                y_train,
                                epochs=2,
                                batch_size=64,
                                validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        hook = smd.get_hook()
        assert hook
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        if not eager_mode:
            assert len(trial.tensor_names(collection="gradients")) > 0
        assert len(trial.tensor_names(collection="weights")) > 0
        assert len(trial.tensor_names(collection="losses")) > 0
Example #10
def test_keras_to_estimator(script_mode):
    """ Works as intended. """
    import tensorflow.compat.v1.keras as keras

    tf.reset_default_graph()
    smd.del_hook()
    keras.backend.clear_session()
    with SagemakerSimulator() as sim:
        model = keras.models.Sequential([
            keras.layers.Dense(16, activation="relu", input_shape=(4, )),
            keras.layers.Dropout(0.2),
            keras.layers.Dense(1, activation="sigmoid"),
        ])

        def input_fn():
            split = tfds.Split.TRAIN
            data_dir = TEST_DATASET_S3_PATH if use_s3_datasets() else None
            dataset = tfds.load("iris",
                                data_dir=data_dir,
                                split=split,
                                as_supervised=True)
            dataset = dataset.map(lambda features, labels:
                                  ({
                                      "dense_input": features
                                  }, labels))
            dataset = dataset.batch(32).repeat()
            return dataset

        model.compile(loss="categorical_crossentropy", optimizer="adam")
        model.summary()

        keras_estimator = tf.keras.estimator.model_to_estimator(
            keras_model=model, model_dir=sim.out_dir)

        if script_mode:
            hook = smd.EstimatorHook(sim.out_dir)
            hook.set_mode(smd.modes.TRAIN)
            keras_estimator.train(input_fn=input_fn, steps=25, hooks=[hook])
            hook.set_mode(smd.modes.EVAL)
            eval_result = keras_estimator.evaluate(input_fn=input_fn,
                                                   steps=10,
                                                   hooks=[hook])
        else:
            keras_estimator.train(input_fn=input_fn, steps=25)
            keras_estimator.evaluate(input_fn=input_fn, steps=10)

        tr = smd.create_trial(sim.out_dir)
        assert len(tr.tensor_names()) == 1
        assert tr.steps() == [0, 25]
        assert len(tr.steps(smd.modes.TRAIN)) == 1
        assert len(tr.steps(smd.modes.EVAL)) == 1
Example #11
def test_temp_paths():
    with SagemakerSimulator() as sim:
        for path in [
                "/opt/ml/output/tensors/events/a",
                "/opt/ml/output/tensors/a",
                "/opt/ml/output/tensors/events/a/b",
        ]:
            temp_path = get_temp_path(path)
            assert temp_path.endswith(SMDEBUG_TEMP_PATH_SUFFIX)

    with ScriptSimulator() as sim:
        for path in ["/a/b/c", "/opt/ml/output/a", "a/b/c"]:
            temp_path = get_temp_path(path)
            assert temp_path.endswith(SMDEBUG_TEMP_PATH_SUFFIX)
Example #12
def helper_test_keras_v2_gradienttape(script_mode: bool = False,
                                      json_file_contents="{}",
                                      default=False):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    tf.keras.backend.clear_session()

    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        helper_keras_gradienttape_train(script_mode=script_mode,
                                        json_file_contents=json_file_contents,
                                        sim=sim)
        hook = smd.get_hook()

        if script_mode:
            assert hook
            if default:
                assert hook.has_default_hook_configuration()
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
        else:
            if version.parse(tf.__version__) < version.parse("2.1.2"):
                assert not hook  # only supported on TF 2.1.2 and greater
                return
            assert hook
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
            if is_tf_2_2() and default is False:
                # Inputs and outputs are only saved with a non-default collection configuration.
                assert len(trial.tensor_names(collection="inputs")) > 0
                assert len(trial.tensor_names(collection="outputs")) > 0
                assert trial.tensor_names(collection="outputs") == [
                    "predictions"
                ]
                if "dense_layers" in json_file_contents:
                    # Only assert for test_keras_v2_multi_collections
                    # which defines this custom collection
                    assert len(
                        trial.tensor_names(collection="dense_layers")) > 0
                else:
                    assert len(
                        trial.tensor_names(collection="dense_layers")) == 0
Example #13
def test_temp_paths():
    with SagemakerSimulator() as sim:
        for path in [
                "/opt/ml/output/tensors/events/a",
                "/opt/ml/output/tensors/a",
                "/opt/ml/output/tensors/events/a/b",
        ]:
            temp_path = get_temp_path(path)
            assert temp_path.endswith(SAGEMAKER_TEMP_PATH_SUFFIX)
            assert not temp_path.startswith(NON_SAGEMAKER_TEMP_PATH_PREFIX)

    with ScriptSimulator() as sim:
        for path in ["/a/b/c", "/opt/ml/output/a", "a/b/c"]:
            temp_path = get_temp_path(path)
            assert SAGEMAKER_TEMP_PATH_SUFFIX not in temp_path
            assert temp_path.startswith(NON_SAGEMAKER_TEMP_PATH_PREFIX)
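A get_temp_path consistent with the assertions above would branch on whether the file lives under the SageMaker output directory; a minimal sketch in which all constant values are assumptions:

import os
import uuid

SAGEMAKER_OUTPUT_DIR = "/opt/ml/output/tensors"      # assumption
SAGEMAKER_TEMP_PATH_SUFFIX = ".sagemaker-uploading"  # assumption
NON_SAGEMAKER_TEMP_PATH_PREFIX = "/tmp/smdebug"      # assumption

def get_temp_path_sketch(path):
    if path.startswith(SAGEMAKER_OUTPUT_DIR):
        # On SageMaker, mark the file in place with a temp suffix so the
        # uploader can skip files that are still being written.
        return path + SAGEMAKER_TEMP_PATH_SUFFIX
    # Outside SageMaker, stage the file under a temp prefix instead.
    return os.path.join(NON_SAGEMAKER_TEMP_PATH_PREFIX, str(uuid.uuid4()), path.lstrip("/"))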
Example #14
def test_estimator(script_mode):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        train_steps, eval_steps = 80, 20
        helper_train(
            script_mode=script_mode, sim=sim, train_steps=train_steps, eval_steps=eval_steps
        )

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        print(trial)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert trial.steps() == [0, train_steps], "Wrong step count for trial."
Example #15
def test_outdir_sagemaker(monkeypatch):
    with TemporaryDirectory() as dir_name:
        json_file_contents = f"""
                {{
                    "S3OutputPath": "s3://sagemaker-test",
                    "LocalPath": "{dir_name}",
                    "HookParameters" : {{
                        "save_interval": "2",
                        "include_workers": "all"
                    }}
                }}
                """
        from smdebug.tensorflow import get_hook

        with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
            hook = get_hook("keras", create_if_not_exists=True)
            assert hook.out_dir == dir_name
Example #16
def test_keras_gradients_mirrored(include_workers="one"):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    tf.keras.backend.clear_session()
    json_file_contents_p1 = """
            {
                "S3OutputPath": "s3://sagemaker-test",
                "LocalPath": "/opt/ml/output/tensors",
                "HookParameters" : {

            """
    json_file_contents_p2 = f'"include_workers": "{include_workers}",'
    json_file_contents_p3 = """
                    "save_interval": "3"
                },
                "CollectionConfigurations": [
                    {
                        "CollectionName": "gradients"
                    },
                    {
                        "CollectionName": "optimizer_variables"
                    },
                    {
                        "CollectionName": "losses"
                    },
                    {
                        "CollectionName": "weights"
                    },
                    {
                        "CollectionName": "biases"
                    },
                    {
                        "CollectionName": "outputs"
                    },
                    {
                        "CollectionName": "metrics"
                    }
                ]
            }
            """
    json_file_contents = json_file_contents_p1 + json_file_contents_p2 + json_file_contents_p3
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        test_tf_keras("/opt/ml/output/tensors",
                      zcc=True,
                      include_workers=include_workers)
Example #17
def test_json_params_sagemaker():
    with SagemakerSimulator() as sim:
        params_dict = get_json_config_as_dict(
            json_config_path="tests/core/json_configs/all_params.json")
        hook_params = collect_hook_config_params(params_dict)
        include_collections = get_include_collections(params_dict)
        coll_manager = CollectionManager()
        add_collections_to_manager(coll_manager, params_dict, hook_params)
        assert hook_params["include_workers"] == "one"
        assert hook_params["save_all"] is True
        assert coll_manager.get("weights").save_histogram is False
        assert coll_manager.get("gradients").save_histogram is False
        assert "weights" in include_collections
        assert "gradients" in include_collections
        assert len(include_collections) == 2
        assert hook_params["export_tensorboard"] == True
        assert hook_params["tensorboard_dir"] == sim.tensorboard_dir
Example #18
def test_integration_mxnet():

    json_file_contents = """
        {
            "S3OutputPath": "s3://sagemaker-test",
            "LocalPath": "/tmp/mxnet_integ_test",
            "CollectionConfigurations": [
                {
                    "CollectionName": "losses",
                    "CollectionParameters": {
                        "save_interval": 100
                    }
                }
            ]
        }
        """
    with SagemakerSimulator(json_file_contents=json_file_contents) as _:
        train_model()
        validate()
Example #19
def test_integration_mxnet():
    # DEFAULT CONSTANTS
    batch_size = 256
    epochs = 1
    learning_rate = 0.1
    mx.random.seed(128)
    random.seed(12)
    np.random.seed(2)

    context = mx.cpu()
    # Create a Gluon Model.
    net = create_gluon_model()

    # Start the training.
    train_data, val_data = prepare_data(batch_size)

    json_file_contents = """
        {
            "S3OutputPath": "s3://sagemaker-test",
            "LocalPath": "/tmp/mxnet_integ_test",
            "CollectionConfigurations": [
                {
                    "CollectionName": "losses",
                    "CollectionParameters": {
                        "save_interval": 100
                    }
                }
            ]
        }
        """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        train_model(
            net=net,
            epochs=epochs,
            ctx=context,
            learning_rate=learning_rate,
            momentum=0.9,
            train_data=train_data,
            val_data=val_data,
        )

        validate()
Example #20
def main():
    opt = parse_args()
    mx.random.seed(128)
    random.seed(12)
    np.random.seed(2)

    context = mx.cpu() if opt.context.lower() == "cpu" else mx.gpu()
    # Create a Gluon Model.
    net = create_gluon_model()

    # Start the training.
    train_data, val_data = prepare_data(opt.batch_size)

    json_file_contents = """
        {
            "S3OutputPath": "s3://sagemaker-test",
            "LocalPath": "/tmp/mxnet_integ_test",
            "CollectionConfigurations": [
                {
                    "CollectionName": "losses",
                    "CollectionParameters": {
                        "save_interval": 100
                    }
                }
            ]
        }
        """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        train_model(
            net=net,
            epochs=opt.epochs,
            ctx=context,
            learning_rate=opt.learning_rate,
            momentum=0.9,
            train_data=train_data,
            val_data=val_data,
        )
        if opt.validate:
            validate()
Example #21
def test_monitored_session_gradients_zcc():
    """ Works as intended. """
    smd.del_hook()
    json_file_contents = """
    {
        "S3OutputPath": "s3://sagemaker-test",
        "LocalPath": "/opt/ml/output/tensors",
        "CollectionConfigurations": [
            {
                "CollectionName": "gradients"
            },
            {
                "CollectionName": "losses"
            }
        ]
    }
    """
    tf.reset_default_graph()
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.compat.v1.global_variables_initializer()
        mnist = get_data()

        sess = tf.train.MonitoredSession()

        with sess:
            sess.run(init)
            for step in range(1, 101):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="gradients")) > 0
Example #22
def test_sagemaker():
    json_file_contents = """
{
    "S3OutputPath": "s3://sagemaker-test",
    "LocalPath": "/opt/ml/output/tensors",
    "HookParameters": null,
    "CollectionConfigurations": [
        {
            "CollectionName": "weights",
            "CollectionParameters": null
        },
        {
            "CollectionName": "losses",
            "CollectionParameters": null
        }
    ],
    "DebugHookSpecification": null
}
"""
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        smd.del_hook()
        hook = smd.get_hook(hook_type="session", create_if_not_exists=True)
        print(hook)
        assert "weights" in hook.include_collections, hook
Example #23
def test_estimator_gradients_zcc(nested=False, mirrored=False):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    json_file_contents = """
        {
            "S3OutputPath": "s3://sagemaker-test",
            "LocalPath": "/opt/ml/output/tensors",
            "HookParameters" : {
                "save_interval": "2",
                "include_workers": "all"
            },
            "CollectionConfigurations": [
                {
                    "CollectionName": "gradients"
                },
                {
                    "CollectionName": "weights"
                },
                {
                    "CollectionName": "losses"
                },
                {
                    "CollectionName": "biases"
                }
            ]
        }
        """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:

        if mirrored:
            test_basic("/opt/ml/output/tensors", zcc=True)
        else:
            # Setup
            mnist_classifier = get_estimator(nested_optimizer=nested,
                                             mirrored=mirrored)
            train_input_fn, eval_input_fn = get_input_fns()

            # Train and evaluate
            train_steps, eval_steps = 10, 10
            mnist_classifier.train(input_fn=train_input_fn, steps=train_steps)
            mnist_classifier.evaluate(input_fn=eval_input_fn, steps=eval_steps)

            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            print(trial)
            assert smd.get_hook() is not None, "Hook was not created."
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert trial.steps() == [0, 2, 4, 6, 8, 10, 12, 14, 16, 18], "Wrong step count for trial."
            print(trial.tensor_names(collection="gradients"))
            assert len(trial.tensor_names(collection="gradients")) > 0
            assert len(trial.tensor_names(collection="weights")) > 0
            assert len(trial.tensor_names(collection="losses")) > 0
            assert len(
                trial.tensor(
                    trial.tensor_names(collection="gradients")[0]).steps()) > 0
            assert len(trial.modes()) == 2
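The expected step list follows from save_interval=2 applied to 10 train plus 10 eval global steps, with eval continuing the global step count; a quick check of that arithmetic:

# Each mode contributes five saved steps, hence len(trial.modes()) == 2
# and ten saved steps overall.
save_interval, train_steps, eval_steps = 2, 10, 10
expected = [s for s in range(train_steps + eval_steps) if s % save_interval == 0]
assert expected == [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]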
Example #24
def helper_test_keras_v2_gradienttape(script_mode: bool = False,
                                      json_file_contents="{}",
                                      default=False):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    tf.keras.backend.clear_session()

    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28, 1)),  # workaround for TF issue #36279
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10, activation="softmax"),
        ])
        (x_train, y_train), _ = get_keras_data()
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(x_train[..., tf.newaxis] / 255,
                     tf.float32), tf.cast(y_train, tf.int64)))
        dataset = dataset.shuffle(1000).batch(64)

        opt = tf.keras.optimizers.RMSprop()
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
        n_epochs = 1
        if script_mode:
            if json_file_contents == "{}":
                hook = smd.KerasHook(out_dir=sim.out_dir,
                                     export_tensorboard=True)
            else:
                hook = smd.KerasHook.create_from_json_file()

            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with hook.wrap_tape(tf.GradientTape()) as tape:
                        logits = model(data, training=True)  # shape: (64, 10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                    hook.save_tensor(tensor_name="accuracy",
                                     tensor_value=acc,
                                     collections_to_write="metrics")
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            assert hook
            if default:
                assert hook.has_default_hook_configuration()
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
        else:
            # ZCC support was added in smdebug v0.8.0
            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with tf.GradientTape(persistent=True) as tape:
                        logits = model(data, training=True)  # shape: (64, 10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            if not (is_tf_2_2() or is_tf_2_3()):
                assert not hook  # only supported on TF 2.2 and greater
                return
            assert hook
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
            if is_tf_2_2() and default is False:
                # Inputs and outputs are only saved with a non-default collection configuration.
                assert len(trial.tensor_names(collection="inputs")) > 0
                assert len(trial.tensor_names(collection="outputs")) > 0
                assert trial.tensor_names(collection="outputs") == [
                    "predictions"
                ]
                if "dense_layers" in json_file_contents:
                    # Only assert for test_keras_v2_multi_collections
                    # which defines this custom collection
                    assert len(
                        trial.tensor_names(collection="dense_layers")) > 0
                else:
                    assert len(
                        trial.tensor_names(collection="dense_layers")) == 0
Example #25
def helper_test_keras_v2_gradienttape(script_mode: bool = False,
                                      json_file_contents="{}"):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    tf.keras.backend.clear_session()

    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28, 1)),  # workaround for TF issue #36279
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10, activation="softmax"),
        ])
        (x_train, y_train), _ = get_keras_data()
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(x_train[..., tf.newaxis] / 255,
                     tf.float32), tf.cast(y_train, tf.int64)))
        dataset = dataset.shuffle(1000).batch(64)

        opt = tf.keras.optimizers.RMSprop()
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
        n_epochs = 2
        if script_mode:
            if json_file_contents == "{}":
                hook = smd.KerasHook(out_dir=sim.out_dir,
                                     export_tensorboard=True)
            else:
                hook = smd.KerasHook.create_from_json_file()

            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with hook.wrap_tape(tf.GradientTape()) as tape:
                        logits = model(data, training=True)  # shape: (64, 10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                    hook.record_tensor_value(tensor_name="accuracy",
                                             tensor_value=acc)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            assert hook
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
        else:
            # ZCC does not support GradientTape training loops yet (as of smdebug v0.7.2)
            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with tf.GradientTape(persistent=True) as tape:
                        logits = model(data, training=True)  # shape: (64, 10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            assert not hook