def create_hook():
    # With the following SaveConfig, tensors will be saved every 100 steps
    save_config = SaveConfig(save_interval=100)

    # Create a hook that saves all tensors (weights, biases, gradients, and more) while training the model.
    hook = Hook(save_config=save_config, save_all=True)
    return hook
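A minimal attachment sketch (not part of the original example): the model below is illustrative and the PyTorch flavor of the Hook is assumed; register_module/register_loss are smdebug.pytorch.Hook methods. Since create_hook() does not pass out_dir, the output location would presumably come from the SageMaker debugger JSON config when run as a training job.
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 10), nn.ReLU(), nn.Linear(10, 2))
loss_fn = nn.CrossEntropyLoss()

hook = create_hook()
hook.register_module(model)   # record tensors flowing through the model
hook.register_loss(loss_fn)   # also record the loss values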
def test_hook_save_every_step(tmpdir):
    save_config = SaveConfig(save_interval=1)
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir, save_config=save_config)
    run_xgboost_model(hook=hook)
    trial = create_trial(out_dir)
    assert trial.steps() == list(range(10))
def test_lstm_and_generator(out_dir):
    # init hook
    hook = KerasHook(
        out_dir,
        include_collections=[
            CollectionKeys.WEIGHTS,
            CollectionKeys.LOSSES,
            CollectionKeys.GRADIENTS,
        ],
        save_config=SaveConfig(save_steps=[0, 1, 2, 3]),
    )

    # init model
    num_steps = 100
    hidden_size = 100
    vocabulary = 1000
    model = Sequential()
    model.add(Embedding(vocabulary, hidden_size, input_length=num_steps))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(TimeDistributed(Dense(vocabulary)))
    model.add(Activation("softmax"))

    model.compile(
        loss="categorical_crossentropy",
        optimizer=hook.wrap_optimizer(Adam()),
        metrics=["categorical_accuracy"],
    )

    train(3, 32, model, num_steps, hook)

    tr = create_trial(out_dir)
    assert len(tr.tensor_names(collection=CollectionKeys.LOSSES)) > 0
    assert len(tr.tensor_names(collection=CollectionKeys.WEIGHTS)) > 0
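The train() helper called above is not part of this excerpt. A minimal, hypothetical stand-in (random data, assumed batch counts) that feeds a generator to model.fit and passes the smdebug KerasHook as a Keras callback could look like this:
import numpy as np
from tensorflow.keras.utils import to_categorical

def train(epochs, batch_size, model, num_steps, hook, vocabulary=1000):
    # Hypothetical replacement for the train() helper used above.
    def batch_generator():
        while True:
            x = np.random.randint(0, vocabulary, size=(batch_size, num_steps))
            y = np.random.randint(0, vocabulary, size=(batch_size, num_steps))
            # One-hot targets to match the TimeDistributed softmax output.
            yield x, to_categorical(y, num_classes=vocabulary)

    # Passing the KerasHook as a callback is what triggers saving on the
    # steps configured in SaveConfig.
    model.fit(batch_generator(), steps_per_epoch=5, epochs=epochs,
              callbacks=[hook], verbose=0)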
Example #4
def test_loss_collection_with_no_other_collections():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/" + run_id
    hook = t_hook(out_dir=out_dir,
                  save_config=save_config,
                  include_collections=[])
    assert has_training_ended(out_dir) == False
    run_mnist_gluon_model(hook=hook,
                          num_steps_train=10,
                          num_steps_eval=10,
                          register_to_loss_block=True)

    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4

    print(tr.tensor_names())
    tname = tr.tensor_names(regex=".*loss")[0]
    loss_tensor = tr.tensor(tname)
    loss_val = loss_tensor.value(step_num=1)
    assert len(loss_val) > 0

    shutil.rmtree(out_dir)
Example #5
def helper_test_modes(hook=None, out_dir="/tmp/test_output/test_hook_modes/"):
    prefix = str(uuid.uuid4())
    device = torch.device("cpu")
    save_steps = list(range(5))
    model = Net(to_save=save_steps).to(device)
    hook_created = hook is None
    if hook is None:
        out_dir = str(Path(out_dir, prefix))
        hook = Hook(
            out_dir=out_dir,
            save_config=SaveConfig({modes.TRAIN: SaveConfigMode(save_steps=save_steps)}),
            include_collections=[
                CollectionKeys.WEIGHTS,
                CollectionKeys.BIASES,
                CollectionKeys.GRADIENTS,
                CollectionKeys.DEFAULT,
                CollectionKeys.LOSSES,
            ],
        )

    hook.register_module(model)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    hook.set_mode(mode=modes.TRAIN)
    train(model, device, optimizer, num_steps=10, save_steps=save_steps)

    trial = create_trial(path=out_dir, name="test output")

    assert len(trial.modes()) == 1
    assert len(trial.steps()) == 5
    assert len(trial.steps(mode=modes.TRAIN)) == 5
    assert len(trial.steps(mode=modes.EVAL)) == 0

    if hook_created:
        shutil.rmtree(out_dir)
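Only TRAIN mode is configured in the SaveConfig above, which is why the trial reports a single mode. For contrast, a small illustrative sketch (same SaveConfig/SaveConfigMode API as used above) that configures both modes:
# Illustrative only: save every step in TRAIN mode, but only step 0 in EVAL mode.
per_mode_save_config = SaveConfig({
    modes.TRAIN: SaveConfigMode(save_interval=1),
    modes.EVAL: SaveConfigMode(save_steps=[0]),
})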
Example #6
def test_hook_all_zero(hook=None, out_dir=None):
    hook_created = False
    if hook is None:
        hook_created = True
        save_config = SaveConfig(save_steps=[0, 1, 2, 3])
        run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
        out_dir = "/tmp/newlogsRunTest/" + run_id
        print("Registering the hook with out_dir {0}".format(out_dir))
        shutil.rmtree(out_dir, ignore_errors=True)
        hook = t_hook(
            out_dir=out_dir,
            save_config=save_config,
            include_collections=[
                "ReluActivation", "weights", "biases", "gradients"
            ],
        )
        hook.get_collection("ReluActivation").include(["relu*", "input_*"])
    run_mnist_gluon_model(hook=hook,
                          num_steps_train=10,
                          num_steps_eval=10,
                          make_input_zero=True)

    print("Created the trial with out_dir {0}".format(out_dir))
    tr = create_trial(out_dir)
    assert tr
    assert len(tr.steps()) == 4

    tnames = tr.tensor_names(regex="conv._input")
    tname = tnames[0]
    conv_tensor_value = tr.tensor(tname).value(step_num=0)
    assert np.all(conv_tensor_value == 0)
    if hook_created:
        shutil.rmtree(out_dir)
def test_hook_save_config_collections(tmpdir):
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir,
                include_collections=["metrics", "feature_importance"])

    hook.get_collection("metrics").save_config = SaveConfig(save_interval=2)
    hook.get_collection("feature_importance").save_config = SaveConfig(
        save_interval=3)

    run_xgboost_model(hook=hook)

    trial = create_trial(out_dir)
    metric_steps = trial.tensor("train-rmse").steps()
    assert all(step % 2 == 0 for step in metric_steps[:-1])
    fimps = [
        t for t in trial.tensor_names() if t.startswith("feature_importance/")
    ]
    fimp_steps = trial.tensor(fimps[0]).steps()
    assert all(step % 3 == 0 for step in fimp_steps[:-1])
def create_hook(output_dir, module=None, hook_type="saveall", save_steps=None):
    # Create a hook that logs weights, biases, gradients, and inputs/outputs of the model
    if hook_type == "saveall":
        hook = Hook(out_dir=output_dir,
                    save_config=SaveConfig(save_steps=save_steps),
                    save_all=True)
    elif hook_type == "module-input-output":
        # The names of a module's input and output tensors follow this format:
        #   Inputs:  <module_name>_input_<input_index>
        #   Output:  <module_name>_output
        # To log the inputs and output of a module, we create a collection as follows:
        assert module is not None

        # Create a hook that logs weights, biases, gradients and inputs/outputs of model
        hook = Hook(
            out_dir=output_dir,
            save_config=SaveConfig(save_steps=save_steps),
            include_collections=[
                CollectionKeys.WEIGHTS,
                CollectionKeys.GRADIENTS,
                CollectionKeys.BIASES,
                "l_mod",
            ],
        )
        hook.get_collection("l_mod").add_module_tensors(module,
                                                        inputs=True,
                                                        outputs=True)
    elif hook_type == "weights-bias-gradients":
        save_config = SaveConfig(save_steps=save_steps)
        # Create a hook that logs weights, biases, and gradients (plus the default and losses collections)
        hook = Hook(
            out_dir=output_dir,
            save_config=save_config,
            include_collections=[
                CollectionKeys.WEIGHTS,
                CollectionKeys.BIASES,
                CollectionKeys.GRADIENTS,
                CollectionKeys.DEFAULT,
                CollectionKeys.LOSSES,
            ],
        )
    return hook
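A short usage sketch of the "module-input-output" variant above; the model and output path are illustrative, not from the original source:
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 4), nn.ReLU(), nn.Linear(4, 2))
# Capture the inputs and output of the first Linear layer on steps 0-2.
hook = create_hook("/tmp/module_io_example", module=model[0],
                   hook_type="module-input-output", save_steps=[0, 1, 2])
hook.register_module(model)  # smdebug.pytorch.Hook registration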
Example #9
def test_hook():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/newlogsRunTest/" + run_id
    hook = t_hook(out_dir=out_dir, save_config=save_config)
    assert has_training_ended(out_dir) == False
    run_mnist_gluon_model(hook=hook,
                          num_steps_train=10,
                          num_steps_eval=10,
                          register_to_loss_block=True)
    shutil.rmtree(out_dir)
def create_smdebug_hook(out_dir, train_data=None, validation_data=None,
                        frequency=1, collections=None):
    save_config = SaveConfig(save_interval=frequency)
    hook = Hook(
        out_dir=out_dir,
        train_data=train_data,
        validation_data=validation_data,
        save_config=save_config,
        include_collections=collections,
    )

    return hook
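The train_data/validation_data parameters suggest the XGBoost flavor of the Hook, which is passed to xgboost.train as a callback. A hedged usage sketch with synthetic data (the path and parameters below are illustrative):
import numpy as np
import xgboost as xgb

# Synthetic data, for illustration only.
X = np.random.rand(200, 5)
y = np.random.randint(0, 2, size=200)
dtrain = xgb.DMatrix(X, label=y)

hook = create_smdebug_hook("/tmp/xgb_debug_example", train_data=dtrain,
                           frequency=2,
                           collections=["metrics", "feature_importance"])
xgb.train({"objective": "binary:logistic"}, dtrain, num_boost_round=10,
          evals=[(dtrain, "train")], callbacks=[hook])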
Example #11
def create_hook_from_json_config(hook_cls,
                                 json_config_path,
                                 default_values=None):
    """Returns a SessionHook object corresponding to either TF, PT, or MXNet.

    If json_config_path is None, an environment variable must be set.
    Here we compare HookParameters with CollectionConfiguration and set all the defaults.
    """
    params_dict = get_json_config_as_dict(json_config_path=json_config_path)
    hook_params = collect_hook_config_params(params_dict)

    out_dir = hook_params.get("out_dir")
    dry_run = hook_params.get("dry_run", False)
    reduction_config = hook_params.get(CONFIG_RDN_CFG_KEY, None)
    save_config = SaveConfig.from_dict(hook_params.get("save_config_modes"),
                                       default_values)
    include_regex = hook_params.get(CONFIG_INCLUDE_REGEX_KEY)
    include_collections = get_include_collections(params_dict)
    save_all = hook_params.get(CONFIG_SAVE_ALL_KEY, False)
    include_workers = hook_params.get(CONFIG_INCLUDE_WORKERS_KEY, "one")

    # If running as a SageMaker job, emit TensorBoard data only if the JSON file exists
    if is_sagemaker_job():
        tensorboard_dir = get_tensorboard_dir_from_json_config()
        export_tensorboard = tensorboard_dir is not None
    # Otherwise, place TB artifacts in out_dir
    else:
        tensorboard_dir = hook_params[TENSORBOARD_DIR_KEY]
        export_tensorboard = hook_params[EXPORT_TENSORBOARD_KEY]

    hook = hook_cls(
        out_dir=out_dir,
        export_tensorboard=export_tensorboard,
        tensorboard_dir=tensorboard_dir,
        dry_run=dry_run,
        reduction_config=reduction_config,
        save_config=save_config,
        include_regex=include_regex,
        include_collections=include_collections,
        include_workers=include_workers,
        save_all=save_all,
    )
    add_collections_to_manager(hook.collection_manager, params_dict,
                               hook_params)
    return hook
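For orientation, a rough sketch of the kind of JSON this parser consumes, written out from Python. The key names below are assumptions modeled on the SageMaker debugger hook-config format and the parameters read above; they are not taken from this source and may not match the exact schema:
import json

config = {
    # Assumed keys; verify against the library's JSON config schema.
    "LocalPath": "/tmp/json_hook_example",
    "HookParameters": {"save_interval": "10", "save_all": "false"},
    "CollectionConfigurations": [
        {"CollectionName": "weights"},
        {"CollectionName": "losses"},
    ],
}
with open("/tmp/hook_config.json", "w") as f:
    json.dump(config, f)

# hook = create_hook_from_json_config(SomeFrameworkHook, "/tmp/hook_config.json")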
def test_hook():
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    run_id = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir = "/tmp/newlogsRunTest/" + run_id
    hook = t_hook(out_dir=out_dir, save_config=save_config)
    run_mnist_gluon_model(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        register_to_loss_block=True,
        save_custom_tensor=True,
    )
    trial = create_trial(out_dir)
    custom_tensors = trial.tensor_names(collection=CollectionKeys.DEFAULT)
    all_tensors = trial.tensor_names()
    assert len(custom_tensors) == 2
    assert len(all_tensors) == 4
    shutil.rmtree(out_dir)
Example #13
def test_hook_save_all(tmpdir):
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))

    hook = Hook(out_dir=out_dir, save_config=save_config, save_all=True)
    run_xgboost_model(hook=hook)

    trial = create_trial(out_dir)
    collections = trial.collections()
    tensors = trial.tensor_names()
    assert len(tensors) > 0
    assert len(trial.steps()) == 4
    assert "all" in collections
    assert "metrics" in collections
    assert "feature_importance" in collections
    assert "train-rmse" in tensors
    assert any(t.startswith("feature_importance/") for t in tensors)
    assert any(t.startswith("trees/") for t in tensors)
    assert len(collections["all"].tensor_names) == len(tensors)
Example #14
def test_hook(tmpdir):
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir, save_config=save_config)
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook)
def test_spot_hook():
    os.environ[CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR] = (
        "./tests/mxnet/test_json_configs/checkpointconfig.json"
    )
    checkpoint_path = "/tmp/savedParams"
    if not os.path.exists(checkpoint_path):
        os.mkdir(checkpoint_path)
    save_config = SaveConfig(
        save_steps=[10, 11, 12, 13, 14, 40, 50, 60, 70, 80])
    """
    Run the training for 2 epochs and save the parameter after every epoch.
    We expect that steps 0 to 14 will be written.
    """

    run_id_1 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir_1 = "/tmp/newlogsRunTest/" + run_id_1
    hook = t_hook(out_dir=out_dir_1,
                  save_config=save_config,
                  include_collections=["weights", "gradients"])
    assert has_training_ended(out_dir_1) == False
    run_mnist(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        epochs=2,
        save_interval=1,
        save_path=checkpoint_path,
    )
    """
    Run the training again for 4 epochs and save the parameter after every epoch.
    We DONOT expect that steps 0 to 14 are written.
    We expect to read steps 40, 50, 60, 70 and 80
    """
    run_id_2 = "trial_" + datetime.now().strftime("%Y%m%d-%H%M%S%f")
    out_dir_2 = "/tmp/newlogsRunTest/" + run_id_2
    hook = t_hook(out_dir=out_dir_2,
                  save_config=save_config,
                  include_collections=["weights", "gradients"])
    assert has_training_ended(out_dir_2) == False
    run_mnist(
        hook=hook,
        num_steps_train=10,
        num_steps_eval=10,
        epochs=4,
        save_interval=1,
        save_path=checkpoint_path,
    )
    # Unset the environment variable before validation so that it does not affect other tests in the pytest environment.
    del os.environ[CHECKPOINT_CONFIG_FILE_PATH_ENV_VAR]

    # Validation
    print("Created the trial with out_dir {0} for the first training".format(
        out_dir_1))
    tr = create_trial(out_dir_1)
    assert tr
    available_steps_1 = tr.steps()
    assert 40 not in available_steps_1
    assert 80 not in available_steps_1
    print(available_steps_1)

    print("Created the trial with out_dir {0} for the second training".format(
        out_dir_2))
    tr = create_trial(out_dir_2)
    assert tr
    available_steps_2 = tr.steps()
    assert 40 in available_steps_2
    assert 50 in available_steps_2
    assert 60 in available_steps_2
    assert 70 in available_steps_2
    assert 80 in available_steps_2
    assert 0 not in available_steps_2
    assert 10 not in available_steps_2
    assert 11 not in available_steps_2
    assert 12 not in available_steps_2
    print(available_steps_2)

    print("Cleaning up.")
    shutil.rmtree(os.path.dirname(out_dir_1))
    shutil.rmtree(checkpoint_path, ignore_errors=True)