def test_lstm_and_generator(out_dir):
    # init hook
    hook = KerasHook(
        out_dir,
        include_collections=[
            CollectionKeys.WEIGHTS,
            CollectionKeys.LOSSES,
            CollectionKeys.GRADIENTS,
        ],
        save_config=SaveConfig(save_steps=[0, 1, 2, 3]),
    )

    # init model
    num_steps = 100
    hidden_size = 100
    vocabulary = 1000
    model = Sequential()
    model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(vocabulary)))
    model.add(Activation("softmax"))

    model.compile(
        loss="categorical_crossentropy",
        # wrap the optimizer so the hook can capture gradients
        optimizer=hook.wrap_optimizer(Adam()),
        metrics=["categorical_accuracy"],
    )

    train(3, 32, model, num_steps, hook)

    tr = create_trial(out_dir)
    assert len(tr.tensor_names(collection=CollectionKeys.LOSSES)) > 0
    assert len(tr.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0
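# The `train` helper used above is defined elsewhere in the test module. A
# minimal sketch of what it plausibly looks like, assuming the signature
# train(num_epochs, batch_size, model, num_steps, hook) and random data
# (hypothetical reconstruction, not the suite's actual helper):
import numpy as np

def train(num_epochs, batch_size, model, num_steps, hook):
    def batches():
        # Random token ids in, one-hot targets out, shaped to match the
        # Embedding / TimeDistributed(Dense(vocabulary)) model above.
        while True:
            x = np.random.randint(0, 1000, size=(batch_size, num_steps))
            y = np.eye(1000)[np.random.randint(0, 1000, size=(batch_size, num_steps))]
            yield x, y

    # The smdebug KerasHook doubles as a Keras callback, so passing it to
    # fit() lets it save tensors at the configured steps.
    model.fit(batches(), steps_per_epoch=4, epochs=num_epochs, callbacks=[hook])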
def test_tf2_profiler_by_time(tf2_profiler_config_parser_by_time, out_dir):
    """
    This test executes a TF2 training script, enables detailed TF profiling by time, and
    verifies the number of events.
    """
    assert tf2_profiler_config_parser_by_time.profiling_enabled

    hook = Hook(out_dir=out_dir)
    helper_keras_fit(trial_dir=out_dir,
                     hook=hook,
                     eager=True,
                     steps=["train", "eval", "predict"])
    hook.close()

    # get tensorboard timeline files
    files = list(
        Path(tf2_profiler_config_parser_by_time.config.local_path,
             "framework").rglob(f"*{TENSORBOARDTIMELINE_SUFFIX}"))

    assert len(files) == 1

    trace_file = str(files[0])
    t_events = TensorboardProfilerEvents()

    t_events.read_events_from_file(trace_file)

    all_trace_events = t_events.get_all_events()
    num_trace_events = len(all_trace_events)

    print(f"Number of events read = {num_trace_events}")

    # The number of events varies slightly between consecutive runs,
    # hence the loose lower bound in the assert below.
    assert num_trace_events >= 700
def helper_tensorflow_tests(use_keras, collection, save_config,
                            with_timestamp):
    """Run a small TF model under either the Keras or the Session hook and
    verify the files it writes (plus the TF event files when timestamped
    scalars are saved)."""
    coll_name, coll_regex = collection

    run_id = "trial_" + coll_name + "-" + datetime.now().strftime(
        "%Y%m%d-%H%M%S%f")
    trial_dir = os.path.join(SMDEBUG_TF_HOOK_TESTS_DIR, run_id)

    if use_keras:
        hook = TF_KerasHook(
            out_dir=trial_dir,
            include_collections=[coll_name],
            save_config=save_config,
            export_tensorboard=True,
        )

        saved_scalars = simple_tf_model(hook, with_timestamp=with_timestamp)

    else:
        hook = TF_SessionHook(
            out_dir=trial_dir,
            include_collections=[coll_name],
            save_config=save_config,
            export_tensorboard=True,
        )

        saved_scalars = tf_session_model(hook, with_timestamp=with_timestamp)
        tf.reset_default_graph()

    hook.close()
    verify_files(trial_dir, save_config, saved_scalars)
    if with_timestamp:
        check_tf_events(trial_dir, saved_scalars)
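# A typical pytest call site for the helper above; the collection tuples and
# save configs here are illustrative stand-ins, not the suite's actual
# parametrization:
@pytest.mark.parametrize("use_keras", [True, False])
@pytest.mark.parametrize("collection", [("losses", "^losses/"),
                                        ("weights", "^weights/")])
@pytest.mark.parametrize("save_config", [SaveConfig(save_steps=[0, 2, 4])])
@pytest.mark.parametrize("with_timestamp", [True, False])
def test_tf_scalar_collections(use_keras, collection, save_config,
                               with_timestamp):
    helper_tensorflow_tests(use_keras, collection, save_config, with_timestamp)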
def test_tf2_profiler_by_time(tf2_profiler_config_parser_by_time, out_dir):
    """
    This test executes a TF2 training script, enables detailed TF profiling by time, and
    verifies the number of events.
    """
    assert tf2_profiler_config_parser_by_time.profiling_enabled

    hook = Hook(out_dir=out_dir)
    helper_keras_fit(trial_dir=out_dir,
                     hook=hook,
                     eager=True,
                     steps=["train", "eval", "predict"])
    hook.close()

    verify_detailed_profiling(out_dir, 700)
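# `verify_detailed_profiling` factors out the inline trace-file check from the
# first variant of this test; a plausible sketch, reusing the names from that
# variant (hypothetical reconstruction, not the suite's actual helper):
def verify_detailed_profiling(out_dir, expected_event_count):
    files = list(Path(out_dir, "framework").rglob(f"*{TENSORBOARDTIMELINE_SUFFIX}"))
    assert len(files) == 1

    t_events = TensorboardProfilerEvents()
    t_events.read_events_from_file(str(files[0]))

    # Event counts vary slightly between runs, so only a lower bound is checked.
    assert len(t_events.get_all_events()) >= expected_event_count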
def main():
    _ = KerasHook(
        out_dir=""
    )  # need this line so that import doesn't get removed by pre-commit
    parser = argparse.ArgumentParser(description="Train ResNet50 on CIFAR-10")
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--epoch", type=int, default=5)

    # argparse's type=bool treats any non-empty string (including "False")
    # as True, so parse boolean flags explicitly.
    def str2bool(s):
        return str(s).lower() in ("true", "1", "yes")

    parser.add_argument("--data_augmentation", type=str2bool, default=False)
    parser.add_argument("--model_dir",
                        type=str,
                        default="./model_keras_resnet")
    parser.add_argument("--enable_bottleneck", type=str2bool, default=True)
    args = parser.parse_args()

    mirrored_strategy = tf.distribute.MirroredStrategy()

    with mirrored_strategy.scope():
        model = ResNet50(weights=None, input_shape=(32, 32, 3), classes=10)
        opt = tf.keras.optimizers.Adam(learning_rate=0.001)
        model.compile(loss="categorical_crossentropy",
                      optimizer=opt,
                      metrics=["accuracy"])

    # start the training.
    train(args.batch_size, args.epoch, model, args.enable_bottleneck,
          args.data_augmentation)
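# The `train` helper for this script is not shown; a minimal sketch assuming
# CIFAR-10 from tf.keras.datasets (hypothetical; the bottleneck and
# data-augmentation branches are elided):
def train(batch_size, epoch, model, enable_bottleneck, data_augmentation):
    (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
    x_train = x_train.astype("float32") / 255.0
    x_test = x_test.astype("float32") / 255.0
    y_train = tf.keras.utils.to_categorical(y_train, 10)
    y_test = tf.keras.utils.to_categorical(y_test, 10)

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epoch,
              validation_data=(x_test, y_test))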
def helper_tensorflow_tests(use_keras, collection, save_config):
    """Variant of the helper above with a fixed set of expected scalars and
    no timestamp checking."""
    coll_name, coll_regex = collection

    run_id = "trial_" + coll_name + "-" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    trial_dir = os.path.join(SMDEBUG_TF_HOOK_TESTS_DIR, run_id)

    if use_keras:
        hook = TF_KerasHook(
            out_dir=trial_dir,
            include_collections=[coll_name],
            save_config=save_config,
            export_tensorboard=True,
        )

        simple_tf_model(hook)

        saved_scalars = [
            "scalar/tf_keras_num_steps",
            "scalar/tf_keras_before_train",
            "scalar/tf_keras_after_train",
        ]
    else:
        hook = TF_SessionHook(
            out_dir=trial_dir,
            include_collections=[coll_name],
            save_config=save_config,
            export_tensorboard=True,
        )

        tf_session_model(hook)
        tf.reset_default_graph()

        saved_scalars = [
            "scalar/tf_session_num_steps",
            "scalar/tf_session_before_train",
            "scalar/tf_session_after_train",
        ]
    hook.close()
    verify_files(trial_dir, save_config, saved_scalars)
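# The names in `saved_scalars` come from explicit save_scalar calls inside the
# model helpers (the "scalar/" prefix appears to be added by the hook). A
# sketch of the pattern, assuming smdebug's hook.save_scalar API:
def simple_tf_model_sketch(hook):
    hook.save_scalar("tf_keras_num_steps", 10, sm_metric=True)
    hook.save_scalar("tf_keras_before_train", 1)
    # ... build and fit a small Keras model with the hook as a callback ...
    hook.save_scalar("tf_keras_after_train", 1)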
def helper_tensorflow_tests(collection, save_config):
    coll_name, coll_regex = collection

    run_id = "trial_" + coll_name + "-" + datetime.now().strftime(
        "%Y%m%d-%H%M%S%f")
    trial_dir = os.path.join(SMDEBUG_TF_HOOK_TESTS_DIR, run_id)

    hook = TF_Hook(out_dir=trial_dir,
                   include_collections=[coll_name],
                   export_tensorboard=True)
    coll = hook.get_collection(coll_name)
    coll.save_config = save_config
    save_steps = save_config.get_save_config(ModeKeys.TRAIN).save_steps
    if not save_steps:
        save_interval = save_config.get_save_config(
            ModeKeys.TRAIN).save_interval
        save_steps = list(range(0, 10, save_interval))

    simple_tf_model(hook)
    hook.close()

    saved_scalars = ["loss"]
    check_trials(trial_dir, save_steps, coll_name, saved_scalars)
    check_metrics_file(saved_scalars)
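# Worked example of the save-step fallback above, assuming smdebug's
# SaveConfig semantics: with no explicit save_steps and save_interval=3,
# steps 0..9 collapse to [0, 3, 6, 9].
example_config = SaveConfig(save_interval=3)
assert not example_config.get_save_config(ModeKeys.TRAIN).save_steps
assert list(range(0, 10, 3)) == [0, 3, 6, 9]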
def test_native_tf2_profiling(
    monkeypatch,
    python_profiler_name,
    model_type,
    use_mirrored_strategy,
    get_model,
    native_tf2_cprofile_profiler_config_parser,
    native_tf2_pyinstrument_profiler_config_parser,
    out_dir,
    mnist_dataset,
    tf_eager_mode,
):
    """
    Enable all types of profiling and validate the output artfacts. Parametrizes on the type of Python
    profiler used for Python profiling as well as the model used for training.

    We cannot test dataloader profiling in pytest, because the resource config needs to be configured at
    /opt/ml/input/config/resourceconfig.json before tensorflow is even imported.
    """
    if python_profiler_name == CPROFILE_NAME:
        profiler_config_parser = native_tf2_cprofile_profiler_config_parser
    else:
        profiler_config_parser = native_tf2_pyinstrument_profiler_config_parser

    assert profiler_config_parser.profiling_enabled
    profiler_config_parser.load_config()

    hook = Hook(out_dir=out_dir, save_all=True)
    hook.profiler_config_parser = profiler_config_parser
    # Known issue: logging from a python callback function (i.e. atexit) during pytest
    # causes logging errors, so disable the hook's logger.
    # See https://github.com/pytest-dev/pytest/issues/5502 for more information.
    hook.logger.disabled = True

    if use_mirrored_strategy:
        strategy = tf.distribute.MirroredStrategy()
        num_devices = strategy.num_replicas_in_sync
        with strategy.scope():
            model = get_model(model_type)
            optimizer = tf.optimizers.Adam()
        train_step_func = _distributed_train_step
    else:
        strategy = None
        num_devices = 1
        model = get_model(model_type)
        optimizer = tf.optimizers.Adam()
        train_step_func = _train_step

    optimizer = hook.wrap_optimizer(optimizer)
    _training_loop(hook, profiler_config_parser, model, optimizer,
                   mnist_dataset, train_step_func, strategy)

    # Sanity check debugger output
    _verify_tensor_names(out_dir)

    # Validate all timeline files
    _verify_timeline_files(out_dir)

    # Validate detailed profiling
    expected_event_count = 90 if use_mirrored_strategy else 230
    verify_detailed_profiling(out_dir, expected_event_count)

    # The expected number of stats directories is ((num_steps * 2) + 2) * num_devices.
    # This covers profiling for both phases of each step, plus pre-step-zero and
    # post-hook-close python profiling.
    expected_stats_dir_count = (
        (profiler_config_parser.config.python_profiling_config.num_steps * 2) +
        2) * num_devices
    python_stats_dir = os.path.join(out_dir, "framework", "tensorflow",
                                    python_profiler_name)
    validate_python_profiling_stats(python_stats_dir, python_profiler_name,
                                    expected_stats_dir_count)