def test_linear_classifier(script_mode: bool):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        # Setup
        train_input_fn, eval_input_fn = get_input_fns()
        x_feature = tf.feature_column.numeric_column("x", shape=(28, 28))
        estimator = tf.compat.v1.estimator.LinearClassifier(
            feature_columns=[x_feature],
            model_dir="/tmp/mnist_linear_classifier",
            n_classes=10)

        # Train
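        # script_mode=True attaches the smdebug hook explicitly; script_mode=False
        # exercises the zero-code-change (ZCC) path, where the hook is expected to
        # be created automatically from the debugger configuration that
        # SagemakerSimulator provides.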
        if script_mode:
            hook = smd.EstimatorHook(out_dir=sim.out_dir)
            estimator.train(input_fn=train_input_fn, steps=100, hooks=[hook])
        else:
            estimator.train(input_fn=train_input_fn, steps=100)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
Example 2
def test_keras_v1(script_mode):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    tf.keras.backend.clear_session()
    with SagemakerSimulator() as sim:
        model = get_keras_model_v1()
        (x_train, y_train), (x_test, y_test) = get_keras_data()

        model.compile(
            loss="sparse_categorical_crossentropy",
            optimizer=tf.keras.optimizers.RMSprop(),
            metrics=["accuracy"],
        )
        if script_mode:
            hook = smd.KerasHook(out_dir=sim.out_dir)
            history = model.fit(
                x_train, y_train, batch_size=64, epochs=5, validation_split=0.2, callbacks=[hook]
            )
            test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
        else:
            history = model.fit(x_train, y_train, batch_size=64, epochs=5, validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
Example 3
def test_monitored_session(script_mode):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
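    # The JSON below stands in for the debugger hook configuration that SageMaker
    # would normally provide; SagemakerSimulator makes it available to smdebug, so
    # the hook saves tensors under LocalPath every 100 steps (save_interval).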
    json_file_contents = """
            {
                "S3OutputPath": "s3://sagemaker-test",
                "LocalPath": "/opt/ml/output/tensors",
                "HookParameters" : {
                    "save_interval": "100"
                }
            }
            """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.global_variables_initializer()
        mnist = get_data()

        if script_mode:
            hook = smd.SessionHook(out_dir=sim.out_dir)
            sess = tf.train.MonitoredSession(hooks=[hook])
        else:
            sess = tf.train.MonitoredSession()

        with sess:
            sess.run(init)
            for step in range(1, 101):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
def test_monitored_session(script_mode: bool):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.compat.v1.global_variables_initializer()
        mnist = get_data()

        if script_mode:
            hook = smd.SessionHook(out_dir=sim.out_dir)
            sess = tf.train.MonitoredSession(hooks=[hook])
        else:
            sess = tf.train.MonitoredSession()

        with sess:
            sess.run(init)
            for step in range(1, 101):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
def test_estimator(script_mode: bool):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        # Setup
        mnist_classifier = get_estimator()
        train_input_fn, eval_input_fn = get_input_fns()

        # Train and evaluate
        train_steps, eval_steps = 80, 20
        if script_mode:
            hook = smd.EstimatorHook(out_dir=sim.out_dir)
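            # set_mode tags the steps that follow as TRAIN (and later EVAL) so the
            # saved tensors can be distinguished by mode when the trial is inspected.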
            hook.set_mode(mode=smd.modes.TRAIN)
            mnist_classifier.train(input_fn=train_input_fn,
                                   steps=train_steps,
                                   hooks=[hook])
            hook.set_mode(mode=smd.modes.EVAL)
            mnist_classifier.evaluate(input_fn=eval_input_fn,
                                      steps=eval_steps,
                                      hooks=[hook])
        else:
            mnist_classifier.train(input_fn=train_input_fn, steps=train_steps)
            mnist_classifier.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        print(trial)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert trial.steps() == [0, train_steps], "Wrong step count for trial."
Example 6
def test_keras_gradients(script_mode, tf_optimizer):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    tf.keras.backend.clear_session()
    json_file_contents = """
            {
                "S3OutputPath": "s3://sagemaker-test",
                "LocalPath": "/opt/ml/output/tensors",
                "CollectionConfigurations": [
                    {
                        "CollectionName": "gradients"
                    },
                    {
                        "CollectionName": "optimizer_variables"
                    },
                    {
                        "CollectionName": "losses"
                    }
                ]
            }
            """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = get_keras_model_v1()
        (x_train, y_train), (x_test, y_test) = get_keras_data()

        if tf_optimizer:
            opt = tf.train.RMSPropOptimizer(0.1)
        else:
            opt = tf.keras.optimizers.RMSprop()

        if script_mode:
            hook = smd.KerasHook(
                out_dir=sim.out_dir,
                include_collections=["gradients", "optimizer_variables", "losses"],
            )
            opt = hook.wrap_optimizer(opt)
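            # wrap_optimizer lets the hook intercept the optimizer so the requested
            # "gradients" and "optimizer_variables" collections can be captured.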
            model.compile(
                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(
                x_train, y_train, batch_size=16, epochs=5, validation_split=0.2, callbacks=[hook]
            )
            test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
        else:
            model.compile(
                loss="sparse_categorical_crossentropy", optimizer=opt, metrics=["accuracy"]
            )
            history = model.fit(x_train, y_train, batch_size=16, epochs=5, validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="gradients")) > 0
        if not tf_optimizer:
            # as this is only supported for keras optimizers currently
            assert len(trial.tensor_names(collection="optimizer_variables")) > 0
Example 7
def helper_test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode:
        tf.compat.v1.disable_eager_execution()
    with SagemakerSimulator() as sim:
        model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255

        opt = tf.keras.optimizers.RMSprop()
        if script_mode:
            hook = smd.KerasHook(out_dir=sim.out_dir, export_tensorboard=True)
            opt = hook.wrap_optimizer(opt)
            model.compile(loss="sparse_categorical_crossentropy",
                          optimizer=opt,
                          metrics=["accuracy"])
            history = model.fit(x_train,
                                y_train,
                                batch_size=64,
                                epochs=2,
                                validation_split=0.2,
                                callbacks=[hook])
            test_scores = model.evaluate(x_test,
                                         y_test,
                                         verbose=2,
                                         callbacks=[hook])
        else:
            model.compile(loss="sparse_categorical_crossentropy",
                          optimizer=opt,
                          metrics=["accuracy"])
            history = model.fit(x_train,
                                y_train,
                                batch_size=64,
                                epochs=2,
                                validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        hook = smd.get_hook()
        assert hook
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."

        # DEFAULT TENSORS SAVED
        assert len(trial.tensor_names(
            collection=CollectionKeys.LOSSES)) > 0, "No Losses Saved"
        assert len(trial.tensor_names(
            collection=CollectionKeys.METRICS)) > 0, "No Metrics Saved"
        assert (len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 0
                ), "Weights were not expected to be saved by default"
        assert (len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 0
                ), "Biases were not expected to be saved by default"
def helper_test_keras_v2_json_config(json_file_contents,
                                     script_mode: bool = False,
                                     eager_mode: bool = True):
    """ Tests ZCC with custom hook configs """
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode:
        tf.compat.v1.disable_eager_execution()
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255

        opt = tf.keras.optimizers.RMSprop()
        if script_mode:
            hook = smd.KerasHook.create_from_json_file()
            opt = hook.wrap_optimizer(opt)
            model.compile(loss="sparse_categorical_crossentropy",
                          optimizer=opt,
                          metrics=["accuracy"])
            history = model.fit(x_train,
                                y_train,
                                batch_size=64,
                                epochs=2,
                                validation_split=0.2,
                                callbacks=[hook])
            test_scores = model.evaluate(x_test,
                                         y_test,
                                         verbose=2,
                                         callbacks=[hook])
        else:
            model.compile(loss="sparse_categorical_crossentropy",
                          optimizer=opt,
                          metrics=["accuracy"])
            history = model.fit(x_train,
                                y_train,
                                epochs=2,
                                batch_size=64,
                                validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        hook = smd.get_hook()
        assert hook
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        if not eager_mode:
            assert len(trial.tensor_names(collection="gradients")) > 0
        assert len(trial.tensor_names(collection="weights")) > 0
        assert len(trial.tensor_names(collection="losses")) > 0
Example 9
def helper_test_keras_v2_gradienttape(script_mode: bool = False,
                                      json_file_contents="{}",
                                      default=False):
    """ Test smdebug with a custom GradientTape training loop (script mode and ZCC). """
    smd.del_hook()
    tf.keras.backend.clear_session()

    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        helper_keras_gradienttape_train(script_mode=script_mode,
                                        json_file_contents=json_file_contents,
                                        sim=sim)
        hook = smd.get_hook()

        if script_mode:
            assert hook
            if default:
                assert hook.has_default_hook_configuration()
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
        else:
            if version.parse(tf.__version__) < version.parse("2.1.2"):
                assert not hook  # only supported on TF 2.1.2 and greater
                return
            assert hook
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
            if is_tf_2_2() and default is False:
                # Inputs and Outputs are not saved with the default collection configurations.
                assert len(trial.tensor_names(collection="inputs")) > 0
                assert len(trial.tensor_names(collection="outputs")) > 0
                assert trial.tensor_names(collection="outputs") == [
                    "predictions"
                ]
                if "dense_layers" in json_file_contents:
                    # Only assert for test_keras_v2_multi_collections
                    # which defines this custom collection
                    assert len(
                        trial.tensor_names(collection="dense_layers")) > 0
                else:
                    assert len(
                        trial.tensor_names(collection="dense_layers")) == 0
Example 10
def test_outdir_sagemaker(monkeypatch):
    with TemporaryDirectory() as dir_name:
        json_file_contents = f"""
                {{
                    "S3OutputPath": "s3://sagemaker-test",
                    "LocalPath": "{dir_name}",
                    "HookParameters" : {{
                        "save_interval": "2",
                        "include_workers": "all"
                    }}
                }}
                """
        from smdebug.tensorflow import get_hook

        with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
            hook = get_hook("keras", create_if_not_exists=True)
            assert hook.out_dir == dir_name
Example 11
def test_estimator(script_mode):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    with SagemakerSimulator() as sim:
        train_steps, eval_steps = 80, 20
        helper_train(
            script_mode=script_mode, sim=sim, train_steps=train_steps, eval_steps=eval_steps
        )

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        print(trial)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert trial.steps() == [0, train_steps], "Wrong step count for trial."
def test_monitored_session_gradients_zcc():
    """ Works as intended. """
    smd.del_hook()
    json_file_contents = """
    {
        "S3OutputPath": "s3://sagemaker-test",
        "LocalPath": "/opt/ml/output/tensors",
        "CollectionConfigurations": [
            {
                "CollectionName": "gradients"
            },
            {
                "CollectionName": "losses"
            }
        ]
    }
    """
    tf.reset_default_graph()
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        train_op, X, Y = get_train_op_and_placeholders()
        init = tf.compat.v1.global_variables_initializer()
        mnist = get_data()

        sess = tf.train.MonitoredSession()

        with sess:
            sess.run(init)
            for step in range(1, 101):
                batch_x, batch_y = mnist.train.next_batch(32)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})

        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert smd.get_hook() is not None, "Hook was not created."
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        assert len(trial.tensor_names(collection="gradients")) > 0
def helper_test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode and is_tf_2_3() is False and is_tf_2_2() is False:
        # v1 training APIs are currently not supported
        # in ZCC mode with smdebug 0.9 and AWS TF 2.3.0
        tf.compat.v1.disable_eager_execution()
    enable_tb = not (tf.__version__ == "2.0.2" or is_tf_2_3())
    run_eagerly = None
    if is_tf_2_2() or is_tf_2_3():
        run_eagerly = eager_mode
    with SagemakerSimulator(enable_tb=enable_tb) as sim:
        helper_keras_fit(script_mode=script_mode,
                         eager_mode=eager_mode,
                         run_eagerly=run_eagerly,
                         sim=sim)
        hook = smd.get_hook()
        assert hook
        # Check if the hook was executed with the default
        # hook configuration
        assert hook.has_default_hook_configuration()
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."

        # DEFAULT TENSORS SAVED
        assert len(trial.tensor_names(
            collection=CollectionKeys.LOSSES)) > 0, "No Losses Saved"
        assert len(trial.tensor_names(
            collection=CollectionKeys.METRICS)) > 0, "No Metrics Saved"
        assert (len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 0
                ), "Weights were not expected to be saved by default"
        assert (len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 0
                ), "Biases were not expected to be saved by default"
Example 14
def test_bert_simple():
    # Test bert with the default smdebug configuration
    smd.del_hook()
    with SagemakerSimulator(enable_tb=False) as sim:
        epochs = 1
        model = TFBertForSequenceClassification.from_pretrained(
            "bert-base-uncased")
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        data = tfds.load("glue/mrpc")
        train_dataset = glue_convert_examples_to_features(data["train"],
                                                          tokenizer,
                                                          max_length=128,
                                                          task="mrpc")
        train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        model.compile(optimizer=optimizer, loss=loss)
        model.fit(train_dataset, epochs=epochs, steps_per_epoch=10)

    hook = smd.get_hook()
    assert hook.has_default_hook_configuration()
    hook.close()
    # Check that hook created and tensors saved
    trial = smd.create_trial(path=sim.out_dir)
    assert len(trial.steps()) > 0, "Nothing saved at any step."
    assert len(trial.tensor_names()) > 0, "Tensors were not saved."

    # DEFAULT TENSORS SAVED
    assert len(trial.tensor_names(
        collection=CollectionKeys.LOSSES)) > 0, "No Losses Saved"
    assert len(trial.tensor_names(
        collection=CollectionKeys.METRICS)) > 0, "No Metrics Saved"
    assert (len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 0
            ), "Weights were not expected to be saved by default"
    assert (len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 0
            ), "Biases were not expected to be saved by default"
Example 15
def test_sagemaker():
    json_file_contents = """
{
    "S3OutputPath": "s3://sagemaker-test",
    "LocalPath": "/opt/ml/output/tensors",
    "HookParameters": null,
    "CollectionConfigurations": [
        {
            "CollectionName": "weights",
            "CollectionParameters": null
        },
        {
            "CollectionName": "losses",
            "CollectionParameters": null
        }
    ],
    "DebugHookSpecification": null
}
"""
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        smd.del_hook()
        hook = smd.get_hook(hook_type="session", create_if_not_exists=True)
        print(hook)
        assert "weights" in hook.include_collections, hook
def test_tensorflow2_with_unsupported_version(eager_mode: bool = True):
    """ Verify that no hook is created when the TensorFlow version is unsupported. """
    smd.del_hook()
    helper_keras_fit()
    hook = smd.get_hook()
    assert hook is None
def test_estimator_gradients_zcc(nested=False, mirrored=False):
    """ Works as intended. """
    smd.del_hook()
    tf.reset_default_graph()
    json_file_contents = """
        {
            "S3OutputPath": "s3://sagemaker-test",
            "LocalPath": "/opt/ml/output/tensors",
            "HookParameters" : {
                "save_interval": "2",
                "include_workers": "all"
            },
            "CollectionConfigurations": [
                {
                    "CollectionName": "gradients"
                },
                {
                    "CollectionName": "weights"
                },
                {
                    "CollectionName": "losses"
                },
                {
                    "CollectionName": "biases"
                }
            ]
        }
        """
    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:

        if mirrored:
            test_basic("/opt/ml/output/tensors", zcc=True)
        else:
            # Setup
            mnist_classifier = get_estimator(nested_optimizer=nested,
                                             mirrored=mirrored)
            train_input_fn, eval_input_fn = get_input_fns()

            # Train and evaluate
            train_steps, eval_steps = 10, 10
            mnist_classifier.train(input_fn=train_input_fn, steps=train_steps)
            mnist_classifier.evaluate(input_fn=eval_input_fn, steps=eval_steps)

            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            print(trial)
            assert smd.get_hook() is not None, "Hook was not created."
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert trial.steps() == [
                0,
                2,
                4,
                6,
                8,
                10,
                12,
                14,
                16,
                18,
            ], "Wrong step count for trial."
            print(trial.tensor_names(collection="gradients"))
            assert len(trial.tensor_names(collection="gradients")) > 0
            assert len(trial.tensor_names(collection="weights")) > 0
            assert len(trial.tensor_names(collection="losses")) > 0
            assert len(
                trial.tensor(
                    trial.tensor_names(collection="gradients")[0]).steps()) > 0
            assert len(trial.modes()) == 2
Example 18
def helper_test_keras_v2_gradienttape(script_mode: bool = False,
                                      json_file_contents="{}",
                                      default=False):
    """ Test smdebug with a custom GradientTape training loop (script mode and ZCC). """
    smd.del_hook()
    tf.keras.backend.clear_session()

    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28, 1)),  # workaround for TF issue #36279
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10, activation="softmax"),
        ])
        (x_train, y_train), _ = get_keras_data()
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(x_train[..., tf.newaxis] / 255,
                     tf.float32), tf.cast(y_train, tf.int64)))
        dataset = dataset.shuffle(1000).batch(64)

        opt = tf.keras.optimizers.RMSprop()
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
        n_epochs = 1
        if script_mode:
            if json_file_contents == "{}":
                hook = smd.KerasHook(out_dir=sim.out_dir,
                                     export_tensorboard=True)
            else:
                hook = smd.KerasHook.create_from_json_file()

            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
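                    # wrap_tape wraps the GradientTape so the hook can record tensors
                    # computed under it (e.g., the loss and gradients).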
                    with hook.wrap_tape(tf.GradientTape()) as tape:
                        logits = model(data, training=True)  # shape: (batch_size, 10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                    hook.save_tensor(tensor_name="accuracy",
                                     tensor_value=acc,
                                     collections_to_write="metrics")
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            assert hook
            if default:
                assert hook.has_default_hook_configuration()
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
        else:
            # ZCC support was added in smdebug v0.8.0
            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with tf.GradientTape(persistent=True) as tape:
                        logits = model(data, training=True)  # shape: (batch_size, 10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            if not (is_tf_2_2() or is_tf_2_3()):
                assert not hook  # only supported on TF 2.2 and greater
                return
            assert hook
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
            if is_tf_2_2() and default is False:
                # Inputs and Outputs are not saved with the default collection configurations.
                assert len(trial.tensor_names(collection="inputs")) > 0
                assert len(trial.tensor_names(collection="outputs")) > 0
                assert trial.tensor_names(collection="outputs") == [
                    "predictions"
                ]
                if "dense_layers" in json_file_contents:
                    # Only assert for test_keras_v2_multi_collections
                    # which defines this custom collection
                    assert len(
                        trial.tensor_names(collection="dense_layers")) > 0
                else:
                    assert len(
                        trial.tensor_names(collection="dense_layers")) == 0
def cnn_model_fn(features, labels, mode):
    """Model function for CNN."""
    # Input Layer
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])

    # Convolutional Layer #1
    conv1 = tf.layers.conv2d(
        inputs=input_layer,
        filters=32,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu,
    )

    # Pooling Layer #1
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2 and Pooling Layer #2
    conv2 = tf.layers.conv2d(
        inputs=pool1,
        filters=64,
        kernel_size=[5, 5],
        padding="same",
        activation=tf.nn.relu,
    )
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Dense Layer
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
    dropout = tf.layers.dropout(
        inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN
    )

    # Logits Layer
    logits = tf.layers.dense(inputs=dropout, units=10)

    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        optimizer = smd.get_hook().wrap_optimizer(optimizer)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
Example 20
def helper_mirrored(
    trial_dir,
    save_all=False,
    num_steps=3,
    save_config=None,
    reduction_config=None,
    include_collections=None,
    steps=None,
    zcc=False,
    eval_distributed=False,
    include_workers="all",
):
    num_gpus = get_available_gpus()
    num_devices = num_gpus if num_gpus > 0 else 1
    batch_size = 10 * num_devices

    # input_fn which serves Dataset
    input_fn_provider = InputFnProvider(
        per_device_batch_size(batch_size, num_devices))

    # Use multiple GPUs via MirroredStrategy.
    # All available GPUs will be used if `num_gpus` is omitted.
    # if num_devices > 1:
    distribution = tf.contrib.distribute.MirroredStrategy()
    # print("### Doing Multi GPU Training")
    # else:
    #     distribution = None
    # Pass to RunConfig
    config = tf.estimator.RunConfig(
        train_distribute=distribution,
        eval_distribute=distribution if eval_distributed else None,
        model_dir="/tmp/mnist_convnet_model",
    )

    if save_config is None:
        save_config = smd.SaveConfig(save_interval=2)

    if include_collections is None:
        include_collections = [
            CollectionKeys.WEIGHTS,
            CollectionKeys.BIASES,
            CollectionKeys.GRADIENTS,
            CollectionKeys.LOSSES,
        ]

    if not zcc:
        ts_hook = smd.SessionHook(
            out_dir=trial_dir,
            save_all=save_all,
            include_collections=include_collections,
            save_config=save_config,
            reduction_config=reduction_config,
            include_workers=include_workers,
        )
    else:
        print("zcc is passed; ignoring include_collections and save_config")

    mnist_classifier = tf.estimator.Estimator(model_fn=cnn_model_fn,
                                              config=config)
    if steps is None:
        steps = ["train"]

    for s in steps:
        if s == "train":
            print("Starting train")
            if not zcc:
                ts_hook.set_mode(smd.modes.TRAIN)
                # Train the model
                mnist_classifier.train(
                    input_fn=input_fn_provider.train_input_fn,
                    steps=num_steps,
                    hooks=[ts_hook])
            else:
                mnist_classifier.train(
                    input_fn=input_fn_provider.train_input_fn, steps=num_steps)
        elif s == "eval":
            print("Starting eval")

            if not zcc:
                ts_hook.set_mode(smd.modes.EVAL)
                # Evaluate the model and print results
                mnist_classifier.evaluate(
                    input_fn=input_fn_provider.eval_input_fn,
                    steps=num_steps,
                    hooks=[ts_hook])
            else:
                mnist_classifier.evaluate(
                    input_fn=input_fn_provider.eval_input_fn, steps=num_steps)
        elif s == "predict":
            print("Starting predict")
            if not zcc:
                ts_hook.set_mode(smd.modes.PREDICT)
                # Evaluate the model and print results
                p = mnist_classifier.predict(
                    input_fn=input_fn_provider.eval_input_fn, hooks=[ts_hook])
            else:
                p = mnist_classifier.predict(
                    input_fn=input_fn_provider.eval_input_fn)
            for i in range(num_steps):
                next(p)
    get_hook()._cleanup()
    return distribution
def helper_test_keras_v2_json_config(json_file_contents,
                                     script_mode: bool = False,
                                     eager_mode: bool = True,
                                     custom_classifier=False):
    """ Tests ZCC with custom hook configs """
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode and is_tf_2_3() is False and is_tf_2_2() is False:
        # v1 training APIs are currently not supported
        # in ZCC mode with smdebug 0.9 and AWS TF 2.3.0
        tf.compat.v1.disable_eager_execution()
    run_eagerly = None
    if is_tf_2_2() or is_tf_2_3():
        run_eagerly = eager_mode
    enable_tb = not (tf.__version__ == "2.0.2" or is_tf_2_3())
    with SagemakerSimulator(json_file_contents=json_file_contents,
                            enable_tb=enable_tb) as sim:
        if custom_classifier:
            model = CustomClassifierModel([
                tf.keras.layers.Flatten(input_shape=(28, 28)),
                tf.keras.layers.Dense(128, activation="relu"),
                tf.keras.layers.Dropout(0.2),
                tf.keras.layers.Dense(10, activation="softmax"),
            ])
        else:
            model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255

        opt = tf.keras.optimizers.RMSprop()
        if script_mode:
            hook = smd.KerasHook.create_from_json_file()
            opt = hook.wrap_optimizer(opt)
            model.compile(
                loss="sparse_categorical_crossentropy",
                optimizer=opt,
                metrics=["accuracy"],
                run_eagerly=run_eagerly,
            )
            history = model.fit(x_train,
                                y_train,
                                batch_size=64,
                                epochs=2,
                                validation_split=0.2,
                                callbacks=[hook])
            test_scores = model.evaluate(x_test,
                                         y_test,
                                         verbose=2,
                                         callbacks=[hook])
        else:
            model.compile(
                loss="sparse_categorical_crossentropy",
                optimizer=opt,
                metrics=["accuracy"],
                run_eagerly=run_eagerly,
            )
            history = model.fit(x_train,
                                y_train,
                                epochs=2,
                                batch_size=64,
                                validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        hook = smd.get_hook()
        assert hook
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."
        if not eager_mode and is_tf_2_2():
            assert len(trial.tensor_names(collection="gradients")) > 0
        assert len(trial.tensor_names(collection="weights")) > 0
        assert len(trial.tensor_names(collection="losses")) > 0
        if is_tf_2_2():
            assert len(trial.tensor_names(collection="inputs")) > 0
            assert len(trial.tensor_names(collection="outputs")) > 0
            if "dense_layers" in json_file_contents:
                # Only assert for test_keras_v2_multi_collections
                # which defines this custom collection
                assert len(trial.tensor_names(collection="dense_layers")) > 0
            else:
                assert len(trial.tensor_names(collection="dense_layers")) == 0
def train_model(
    trial_dir,
    save_all=False,
    hook=None,
    include_collections=None,
    reduction_config=None,
    save_config=None,
    eager=True,
    strategy=None,
    steps=None,
    add_callbacks=None,
    include_workers="all",
):
    tf.keras.backend.clear_session()
    if not eager:
        tf.compat.v1.disable_eager_execution()

    datasets, info = tfds.load(name="mnist",
                               with_info=True,
                               as_supervised=True)

    mnist_train, mnist_test = datasets["train"], datasets["test"]

    if strategy is None:
        strategy = tf.distribute.MirroredStrategy()

    # You can also do info.splits.total_num_examples to get the total
    # number of examples in the dataset.

    BUFFER_SIZE = 10000

    BATCH_SIZE_PER_REPLICA = 64
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

    def scale(image, label):
        image = tf.cast(image, tf.float32)
        image /= 255

        return image, label

    train_dataset = mnist_train.map(scale).cache().shuffle(BUFFER_SIZE).batch(
        BATCH_SIZE)
    eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE)

    if hook is None:
        if save_config is None:
            save_config = SaveConfig(save_interval=3)

        hook = KerasHook(
            out_dir=trial_dir,
            save_config=save_config,
            reduction_config=reduction_config,
            include_collections=include_collections,
            save_all=save_all,
            include_workers=include_workers,
        )

        if not save_all and include_collections is not None:
            for cname in hook.include_collections:
                if cname not in include_collections:
                    hook.get_collection(cname).save_config = SaveConfig(
                        end_step=0)

    opt = tf.keras.optimizers.Adam()

    opt = hook.wrap_optimizer(opt)

    with strategy.scope():
        relu_layer = tf.keras.layers.Dense(64, activation="relu")
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32,
                                   3,
                                   activation="relu",
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            relu_layer,
            tf.keras.layers.Dense(10, activation="softmax"),
        ])
        model.compile(loss="sparse_categorical_crossentropy",
                      optimizer=opt,
                      metrics=["accuracy"])

    hooks = []
    if add_callbacks:
        if "tensorboard" in add_callbacks:
            hooks.append(
                # write_grads=True causes a crash saying the handle must be created in scope,
                # an error like this: https://stackoverflow.com/questions/56836895/custom-training-loop-using-tensorflow-gpu-1-14-and-tf-distribute-mirroredstrateg
                # this crash happens even if the callback is off
                tf.keras.callbacks.TensorBoard(log_dir="/tmp/logs",
                                               histogram_freq=4,
                                               write_images=True))

    hooks.append(hook)
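    # save_scalar writes a custom scalar through the hook; sm_metric=True presumably
    # also surfaces it as a SageMaker metric. The timestamp and steps are kept so the
    # caller can verify the saved scalar later.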
    scalars_to_be_saved = dict()
    ts = time.time()
    scalars_to_be_saved["scalar/foobar"] = (ts, steps)
    hook.save_scalar("foobar", 1, sm_metric=True, timestamp=ts)

    if steps is None:
        steps = ["train"]
    for step in steps:
        if step == "train":
            model.fit(train_dataset,
                      epochs=1,
                      steps_per_epoch=10,
                      callbacks=hooks,
                      verbose=0)
        elif step == "eval":
            model.evaluate(eval_dataset, steps=10, callbacks=hooks, verbose=0)
        elif step == "predict":
            model.predict(train_dataset, steps=4, callbacks=hooks, verbose=0)

    smd.get_hook().close()
    return strategy, scalars_to_be_saved
def helper_test_keras_v2(script_mode: bool = False, eager_mode: bool = True):
    """ Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes."""
    smd.del_hook()
    tf.keras.backend.clear_session()
    if not eager_mode and is_tf_2_3() is False and is_tf_2_2() is False:
        # v1 training APIs are currently not supported
        # in ZCC mode with smdebug 0.9 and AWS TF 2.3.0
        tf.compat.v1.disable_eager_execution()
    enable_tb = not (tf.__version__ == "2.0.2" or is_tf_2_3())
    run_eagerly = None
    if is_tf_2_2() or is_tf_2_3():
        run_eagerly = eager_mode
    with SagemakerSimulator(enable_tb=enable_tb) as sim:
        model = get_keras_model_v2()
        (x_train, y_train), (x_test, y_test) = get_keras_data()
        x_train, x_test = x_train / 255, x_test / 255

        opt = tf.keras.optimizers.RMSprop()
        if script_mode:
            hook = smd.KerasHook(out_dir=sim.out_dir, export_tensorboard=True)
            opt = hook.wrap_optimizer(opt)
            model.compile(
                loss="sparse_categorical_crossentropy",
                optimizer=opt,
                metrics=["accuracy"],
                run_eagerly=run_eagerly,
            )
            history = model.fit(
                x_train, y_train, batch_size=64, epochs=1, validation_split=0.2, callbacks=[hook]
            )
            test_scores = model.evaluate(x_test, y_test, verbose=2, callbacks=[hook])
        else:
            model.compile(
                loss="sparse_categorical_crossentropy",
                optimizer=opt,
                metrics=["accuracy"],
                run_eagerly=run_eagerly,
            )
            history = model.fit(x_train, y_train, batch_size=64, epochs=1, validation_split=0.2)
            test_scores = model.evaluate(x_test, y_test, verbose=2)

        hook = smd.get_hook()
        assert hook
        # Check if the hook was executed with the default
        # hook configuration
        assert hook.has_default_hook_configuration()
        hook.close()
        # Check that hook created and tensors saved
        trial = smd.create_trial(path=sim.out_dir)
        assert len(trial.steps()) > 0, "Nothing saved at any step."
        assert len(trial.tensor_names()) > 0, "Tensors were not saved."

        # DEFAULT TENSORS SAVED
        assert len(trial.tensor_names(collection=CollectionKeys.LOSSES)) > 0, "No Losses Saved"
        assert len(trial.tensor_names(collection=CollectionKeys.METRICS)) > 0, "No Metrics Saved"
        assert (
            len(trial.tensor_names(collection=CollectionKeys.WEIGHTS)) == 0
        ), "Weights were not expected to be saved by default"
        assert (
            len(trial.tensor_names(collection=CollectionKeys.BIASES)) == 0
        ), "Biases were not expected to be saved by default"
Example 24
def cnn_model_fn(features, labels, mode):
    """Model function for CNN."""
    # Input Layer
    # Reshape X to 4-D tensor: [batch_size, width, height, channels]
    # MNIST images are 28x28 pixels, and have one color channel
    input_layer = tf.reshape(features["x"], [-1, 28, 28, 1])

    # Convolutional Layer #1
    # Computes 32 features using a 5x5 filter with ReLU activation.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 28, 28, 1]
    # Output Tensor Shape: [batch_size, 28, 28, 32]
    conv1 = tf.layers.conv2d(inputs=input_layer,
                             filters=32,
                             kernel_size=[5, 5],
                             padding="same",
                             activation=tf.nn.relu)

    # Pooling Layer #1
    # First max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 28, 28, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 32]
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

    # Convolutional Layer #2
    # Computes 64 features using a 5x5 filter.
    # Padding is added to preserve width and height.
    # Input Tensor Shape: [batch_size, 14, 14, 32]
    # Output Tensor Shape: [batch_size, 14, 14, 64]
    conv2 = tf.layers.conv2d(inputs=pool1,
                             filters=64,
                             kernel_size=[5, 5],
                             padding="same",
                             activation=tf.nn.relu)

    # Pooling Layer #2
    # Second max pooling layer with a 2x2 filter and stride of 2
    # Input Tensor Shape: [batch_size, 14, 14, 64]
    # Output Tensor Shape: [batch_size, 7, 7, 64]
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

    # Flatten tensor into a batch of vectors
    # Input Tensor Shape: [batch_size, 7, 7, 64]
    # Output Tensor Shape: [batch_size, 7 * 7 * 64]
    pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])

    # Dense Layer
    # Densely connected layer with 1024 neurons
    # Input Tensor Shape: [batch_size, 7 * 7 * 64]
    # Output Tensor Shape: [batch_size, 1024]
    dense = tf.layers.dense(inputs=pool2_flat,
                            units=1024,
                            activation=tf.nn.relu)

    # Add dropout operation; 0.6 probability that element will be kept
    dropout = tf.layers.dropout(inputs=dense,
                                rate=0.4,
                                training=mode == tf.estimator.ModeKeys.TRAIN)

    # Logits layer
    # Input Tensor Shape: [batch_size, 1024]
    # Output Tensor Shape: [batch_size, 10]
    logits = tf.layers.dense(inputs=dropout, units=10)

    predictions = {
        # Generate predictions (for PREDICT and EVAL mode)
        "classes": tf.argmax(input=logits, axis=1),
        # Add `softmax_tensor` to the graph. It is used for PREDICT and by the
        # `logging_hook`.
        "probabilities": tf.nn.softmax(logits, name="softmax_tensor"),
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Calculate Loss (for both TRAIN and EVAL modes)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # Configure the Training Op (for TRAIN mode)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        optimizer = smd.get_hook().wrap_optimizer(optimizer)
        train_op = optimizer.minimize(loss=loss,
                                      global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op)

    # Add evaluation metrics (for EVAL mode)
    eval_metric_ops = {
        "accuracy":
        tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])
    }
    return tf.estimator.EstimatorSpec(mode=mode,
                                      loss=loss,
                                      eval_metric_ops=eval_metric_ops)
def helper_test_keras_v2_gradienttape(script_mode: bool = False,
                                      json_file_contents="{}"):
    """ Test smdebug with a custom GradientTape training loop (script mode and ZCC). """
    smd.del_hook()
    tf.keras.backend.clear_session()

    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28, 1)),  # workaround for TF issue #36279
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10, activation="softmax"),
        ])
        (x_train, y_train), _ = get_keras_data()
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(x_train[..., tf.newaxis] / 255,
                     tf.float32), tf.cast(y_train, tf.int64)))
        dataset = dataset.shuffle(1000).batch(64)

        opt = tf.keras.optimizers.RMSprop()
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
        n_epochs = 2
        if script_mode:
            if json_file_contents == "{}":
                hook = smd.KerasHook(out_dir=sim.out_dir,
                                     export_tensorboard=True)
            else:
                hook = smd.KerasHook.create_from_json_file()

            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with hook.wrap_tape(tf.GradientTape()) as tape:
                        logits = model(data, training=True)  # shape: (batch_size, 10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                    hook.record_tensor_value(tensor_name="accuracy",
                                             tensor_value=acc)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            assert hook
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
        else:
            # ZCC doesn't support this yet (as of smdebug v0.7.2)
            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with tf.GradientTape(persistent=True) as tape:
                        logits = model(data, training=True)  # shape: (batch_size, 10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            assert not hook