def test_include_regex(out_dir, tf_eager_mode, workers):
    hook = KerasHook(
        out_dir=out_dir,
        save_config=SaveConfig(save_interval=9),
        include_collections=["custom_coll"],
        include_workers=workers,
    )
    hook.get_collection("custom_coll").include("dense")
    strategy, _ = train_model(out_dir,
                              hook=hook,
                              steps=["train"],
                              eager=tf_eager_mode)

    tr = create_trial_fast_refresh(out_dir)
    tnames = tr.tensor_names(collection="custom_coll")

    if tf_eager_mode:
        assert len(tnames) == (12 if is_tf_2_2() else 4)
    else:
        assert len(tnames) == 4 + 3 * strategy.num_replicas_in_sync
    for tname in tnames:
        assert tr.tensor(tname).value(0) is not None
        assert len(tr.tensor(tname).workers(0)) == (
            1 if workers == "one" else strategy.num_replicas_in_sync
        )


def test_include_regex_opt_var(out_dir, tf_eager_mode, workers):
    include_collections = ["custom_optimizer_variables"]
    save_config = SaveConfig(save_interval=3)
    hook = KerasHook(
        out_dir=out_dir,
        save_config=save_config,
        include_collections=include_collections,
        include_workers=workers,
    )
    hook.get_collection("custom_optimizer_variables").include("Adam")
    strategy, _ = train_model(out_dir,
                              hook=hook,
                              steps=["train"],
                              eager=tf_eager_mode)

    tr = create_trial_fast_refresh(out_dir)
    tnames = tr.tensor_names(collection="custom_optimizer_variables")

    if tf_eager_mode:
        assert len(tnames) == 5
    else:
        assert len(tnames) == 4 + 3 * strategy.num_replicas_in_sync
    for tname in tnames:
        assert tr.tensor(tname).value(0) is not None
        assert len(tr.tensor(tname).workers(0)) == (
            1 if workers == "one" else strategy.num_replicas_in_sync
        )


def test_collection_reductions(out_dir, tf_eager_mode):
    tf.keras.backend.clear_session()
    hook = KerasHook(
        out_dir=out_dir,
        save_config=SaveConfig(save_interval=3),
        include_collections=[
            CollectionKeys.WEIGHTS,
            CollectionKeys.BIASES,
            CollectionKeys.GRADIENTS,
        ],
    )
    hook.get_collection(CollectionKeys.WEIGHTS).reduction_config = ReductionConfig(
        norms=["l1"])
    train_model(out_dir, hook=hook, steps=["train"], eager=tf_eager_mode)

    tr = create_trial_fast_refresh(out_dir)
    weight_name = tr.tensor_names(collection=CollectionKeys.WEIGHTS)[0]

    try:
        tr.tensor(weight_name).value(0)
        assert False
    except TensorUnavailableForStep:
        try:
            assert tr.tensor(weight_name).reduction_value(0, "l1") is not None
        except ValueError:
            # the reduction cannot be computed for some tensors
            pass
    except TensorUnavailable:
        # the tensor may not have been saved at all if it was only being
        # saved as a reduction and the reduction computation failed
        pass
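
These snippets come from smdebug's TensorFlow test suite; names such as create_trial_fast_refresh, train_model, is_tf_2_2, and FetchTensorCallback are test helpers assumed to be in scope, not part of the public API. A minimal import block for the public pieces would look roughly like this:

import time

import tensorflow as tf
import tensorflow_datasets as tfds

import smdebug.tensorflow as smd
from smdebug.exceptions import TensorUnavailable, TensorUnavailableForStep
from smdebug.tensorflow import CollectionKeys, KerasHook, ReductionConfig, SaveConfig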

Example #4

def test_include_regex(out_dir):
    hook = KerasHook(
        out_dir, save_config=SaveConfig(save_interval=9), include_collections=["custom_coll"]
    )
    hook.get_collection("custom_coll").include("dense")
    train_model(out_dir, hook=hook, save_config=SaveConfig(save_interval=9), steps=["train"])

    tr = create_trial_fast_refresh(out_dir)
    tnames = tr.tensor_names(collection="custom_coll")

    assert len(tnames) == 12
    for tname in tnames:
        assert tr.tensor(tname).value(0) is not None

Example #5

def test_include_regex(out_dir):
    hook = KerasHook(
        out_dir=out_dir,
        save_config=SaveConfig(save_interval=9),
        include_collections=["custom_coll"],
        include_workers="all",
    )
    hook.get_collection("custom_coll").include("dense")
    strategy, _ = train_model(out_dir, hook=hook, steps=["train"])

    tr = create_trial_fast_refresh(out_dir)
    tnames = tr.tensor_names(collection="custom_coll")

    assert len(tnames) == 4 + 4 + 3 * strategy.num_replicas_in_sync
    for tname in tnames:
        assert tr.tensor(tname).value(0) is not None

Example #6

def test_collection_reductions(out_dir):
    hook = KerasHook(
        out_dir,
        save_config=SaveConfig(save_interval=3),
        include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.GRADIENTS],
    )
    hook.get_collection(CollectionKeys.GRADIENTS).reduction_config = ReductionConfig(norms=["l1"])
    train_model(out_dir, hook=hook, steps=["train"])

    tr = create_trial_fast_refresh(out_dir)
    weight_name = tr.tensor_names(collection=CollectionKeys.WEIGHTS)[0]
    grad_name = tr.tensor_names(collection=CollectionKeys.GRADIENTS)[0]

    assert tr.tensor(weight_name).value(0) is not None
    try:
        tr.tensor(grad_name).value(0)
        assert False
    except TensorUnavailableForStep:
        assert tr.tensor(weight_name).reduction_value(0, "l1") is not None


def train_model(
    trial_dir,
    save_all=False,
    include_collections=None,
    reduction_config=None,
    save_config=None,
    use_tf_keras=True,
    hook=None,
    eager=False,
    use_keras_optimizer=True,
    create_relu_collection=False,
    steps=None,
    add_callbacks=None,
):
    if use_tf_keras:
        from tensorflow import keras
    else:
        import keras

    mnist = keras.datasets.mnist

    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    x_train, x_test = x_train / 255.0, x_test / 255.0

    relu_layer = keras.layers.Dense(128, activation="relu")

    model = keras.models.Sequential([
        keras.layers.Flatten(input_shape=(28, 28)),
        relu_layer,
        keras.layers.Dropout(0.2),
        keras.layers.Dense(10, activation="softmax"),
    ])

    if hook is None:
        if save_config is None:
            save_config = SaveConfig(save_interval=3)

        hook = KerasHook(
            trial_dir,
            save_config=save_config,
            save_all=save_all,
            include_collections=include_collections,
            reduction_config=reduction_config,
        )

        if not save_all and include_collections is not None:
            for cname in hook.include_collections:
                if cname not in include_collections:
                    hook.get_collection(cname).save_config = SaveConfig(
                        end_step=0)

    if create_relu_collection:
        hook.get_collection("relu").add_keras_layer(relu_layer,
                                                    inputs=True,
                                                    outputs=True)

    if use_keras_optimizer:
        opt = keras.optimizers.RMSprop()
    else:
        # the non-Keras optimizer path; under TF2 this class lives in tf.compat.v1
        opt = tf.compat.v1.train.RMSPropOptimizer(0.1)

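    # wrap_optimizer instruments the optimizer so the hook can save gradients and optimizer variables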
    opt = hook.wrap_optimizer(opt)

    if use_tf_keras:
        model.compile(
            optimizer=opt,
            loss="sparse_categorical_crossentropy",
            run_eagerly=eager,
            metrics=["accuracy"],
        )
    else:
        model.compile(optimizer=opt,
                      loss="sparse_categorical_crossentropy",
                      metrics=["accuracy"])

    hooks = []
    if add_callbacks:
        if "tensorboard" in add_callbacks:
            hooks.append(
                tf.keras.callbacks.TensorBoard(log_dir="/tmp/logs",
                                               histogram_freq=1,
                                               write_grads=True,
                                               write_images=True))
        if "fetch_tensor" in add_callbacks:
            hooks.append(FetchTensorCallback(model.outputs + model.weights))
    hooks.append(hook)

    if steps is None:
        steps = ["train"]
    for step in steps:
        if step == "train":
            model.fit(x_train,
                      y_train,
                      epochs=1,
                      steps_per_epoch=10,
                      callbacks=hooks,
                      verbose=0)
        elif step == "eval":
            model.evaluate(x_test,
                           y_test,
                           steps=10,
                           callbacks=hooks,
                           verbose=0)
        elif step == "predict":
            model.predict(x_test[:100], callbacks=hooks, verbose=0)

    hook._cleanup()
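
A hypothetical call to this helper, mirroring the tests above (the path and collection list are illustrative):

train_model(
    "/tmp/keras_trial",
    include_collections=[CollectionKeys.WEIGHTS, CollectionKeys.GRADIENTS],
    save_config=SaveConfig(save_interval=9),
    steps=["train", "eval"],
)
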
def train_model(
    trial_dir,
    save_all=False,
    hook=None,
    include_collections=None,
    reduction_config=None,
    save_config=None,
    eager=True,
    strategy=None,
    steps=None,
    add_callbacks=None,
    include_workers="all",
):
    tf.keras.backend.clear_session()
    if not eager:
        tf.compat.v1.disable_eager_execution()

    datasets, info = tfds.load(name="mnist",
                               with_info=True,
                               as_supervised=True)

    mnist_train, mnist_test = datasets["train"], datasets["test"]

    if strategy is None:
        strategy = tf.distribute.MirroredStrategy()

    # You can also do info.splits.total_num_examples to get the total
    # number of examples in the dataset.

    BUFFER_SIZE = 10000

    BATCH_SIZE_PER_REPLICA = 64
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync

    def scale(image, label):
        image = tf.cast(image, tf.float32)
        image /= 255

        return image, label

    train_dataset = mnist_train.map(scale).cache().shuffle(BUFFER_SIZE).batch(
        BATCH_SIZE)
    eval_dataset = mnist_test.map(scale).batch(BATCH_SIZE)

    if hook is None:
        if save_config is None:
            save_config = SaveConfig(save_interval=3)

        hook = KerasHook(
            out_dir=trial_dir,
            save_config=save_config,
            reduction_config=reduction_config,
            include_collections=include_collections,
            save_all=save_all,
            include_workers=include_workers,
        )

        if not save_all and include_collections is not None:
            for cname in hook.include_collections:
                if cname not in include_collections:
                    hook.get_collection(cname).save_config = SaveConfig(
                        end_step=0)

    opt = tf.keras.optimizers.Adam()

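    # wrapping the Keras optimizer lets the hook capture gradients and Adam's slot variables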
    opt = hook.wrap_optimizer(opt)

    with strategy.scope():
        relu_layer = tf.keras.layers.Dense(64, activation="relu")
        model = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32,
                                   3,
                                   activation="relu",
                                   input_shape=(28, 28, 1)),
            tf.keras.layers.MaxPooling2D(),
            tf.keras.layers.Flatten(),
            relu_layer,
            tf.keras.layers.Dense(10, activation="softmax"),
        ])
        model.compile(loss="sparse_categorical_crossentropy",
                      optimizer=opt,
                      metrics=["accuracy"])

    hooks = []
    if add_callbacks:
        if "tensorboard" in add_callbacks:
            hooks.append(
                # write_grads=True causes a crash saying the handle must be created in scope,
                # an error like https://stackoverflow.com/questions/56836895/custom-training-loop-using-tensorflow-gpu-1-14-and-tf-distribute-mirroredstrateg
                # this crash happens even if the callback is off
                tf.keras.callbacks.TensorBoard(log_dir="/tmp/logs",
                                               histogram_freq=4,
                                               write_images=True))

    hooks.append(hook)
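    # save a custom scalar through the hook; sm_metric=True also emits it as a SageMaker metric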
    scalars_to_be_saved = dict()
    ts = time.time()
    scalars_to_be_saved["scalar/foobar"] = (ts, steps)
    hook.save_scalar("foobar", 1, sm_metric=True, timestamp=ts)

    if steps is None:
        steps = ["train"]
    for step in steps:
        if step == "train":
            model.fit(train_dataset,
                      epochs=1,
                      steps_per_epoch=10,
                      callbacks=hooks,
                      verbose=0)
        elif step == "eval":
            model.evaluate(eval_dataset, steps=10, callbacks=hooks, verbose=0)
        elif step == "predict":
            model.predict(train_dataset, steps=4, callbacks=hooks, verbose=0)

    smd.get_hook().close()
    return strategy, scalars_to_be_saved
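
This distributed variant returns the MirroredStrategy plus the scalars it queued, which is why the tests above unpack two values. A hypothetical call (path and names are illustrative):

strategy, saved_scalars = train_model(
    "/tmp/mirrored_trial",
    include_collections=["custom_coll"],
    include_workers="all",
    steps=["train"],
)
assert strategy.num_replicas_in_sync >= 1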