def test_hook_shap(tmpdir):
    """Check the tensors saved for the average_shap and full_shap collections.

    A 10x10 random training matrix is attached to the hook; after the run the
    test inspects collection names, tensor-name prefixes and value shapes.
    """
    np.random.seed(42)
    features = np.random.rand(10, 10)
    labels = np.random.randint(2, size=10)
    dtrain = xgboost.DMatrix(features, label=labels)

    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    shap_hook = Hook(
        out_dir=out_dir,
        include_collections=["average_shap", "full_shap"],
        train_data=dtrain,
    )
    run_xgboost_model(hook=shap_hook)

    trial = create_trial(out_dir)
    tensor_names = trial.tensor_names()
    assert len(tensor_names) > 0

    # Both collections must be registered and each must own some tensor.
    for collection_name in ("average_shap", "full_shap"):
        assert collection_name in trial.collections()
    for prefix in ("average_shap/", "full_shap/"):
        assert any(name.startswith(prefix) for name in tensor_names)

    # No tensor name may end in "/bias".
    assert not any(name.endswith("/bias") for name in tensor_names)

    # Inspect the last matching tensor of each kind at step 0.
    last_average = [
        name for name in tensor_names if name.startswith("average_shap/")
    ][-1]
    assert trial.tensor(last_average).value(0).shape == (1, )

    last_full = [
        name for name in tensor_names if name.startswith("full_shap/")
    ][-1]
    # full shap values should have 10 rows with 10 features + 1 bias
    assert trial.tensor(last_full).value(0).shape == (10, 11)
def test_hook_validation(tmpdir):
    """Check that labels and predictions are saved when data is attached.

    The hook is given both a training and a validation DMatrix (5x10 random
    data each) and is asked to include the "labels" and "predictions"
    collections.
    """
    np.random.seed(42)
    # Draw order (train features, train labels, valid features, valid labels)
    # is kept so the seeded random stream yields the same data.
    dtrain = xgboost.DMatrix(
        np.random.rand(5, 10), label=np.random.randint(2, size=5))
    dvalid = xgboost.DMatrix(
        np.random.rand(5, 10), label=np.random.randint(2, size=5))

    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    validation_hook = Hook(
        out_dir=out_dir,
        include_collections=["labels", "predictions"],
        train_data=dtrain,
        validation_data=dvalid,
    )
    run_xgboost_model(hook=validation_hook)

    trial = create_trial(out_dir)
    tensor_names = trial.tensor_names()
    assert len(tensor_names) > 0
    for expected in ("labels", "predictions"):
        assert expected in trial.collections()
    for expected in ("labels", "predictions"):
        assert expected in tensor_names
def test_hook_save_every_step(tmpdir):
    """With save_interval=1 the trial must report steps 0 through 9."""
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    every_step = SaveConfig(save_interval=1)
    run_xgboost_model(hook=Hook(out_dir=out_dir, save_config=every_step))

    saved_steps = create_trial(out_dir).steps()
    assert saved_steps == list(range(10))
Example #4
0
def test_hook_from_json_config_for_losses(tmpdir, monkeypatch, params):
    """Check which eval metrics land in the LOSSES collection.

    The hook is created from a JSON config file located through the
    CONFIG_FILE_PATH_ENV_STR environment variable. For "rmse" the train/test
    metric tensors must appear in both METRICS and LOSSES; for "auc" and
    "map" they must appear in METRICS only. Other metrics are not checked.
    """
    out_dir = tmpdir.join("test_hook_from_json_config_for_losses")
    config_file = tmpdir.join("config.json")
    config_file.write(get_json_config_for_losses(str(out_dir)))
    monkeypatch.setenv(CONFIG_FILE_PATH_ENV_STR, str(config_file))

    hook = Hook.create_from_json_file()
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook, params=params)
    trial = create_trial(str(out_dir))

    eval_metric = params["eval_metric"]
    if eval_metric == "rmse":
        expect_in_losses = True
    elif eval_metric in ("auc", "map"):
        expect_in_losses = False
    else:
        # Nothing is asserted for any other metric.
        return

    for data_name in ("train", "test"):
        metric_name = f"{data_name}-{eval_metric}"
        assert metric_name in trial.tensor_names(
            collection=CollectionKeys.METRICS)
        in_losses = metric_name in trial.tensor_names(
            collection=CollectionKeys.LOSSES)
        assert in_losses == expect_in_losses
def test_hook_from_json_config_full(tmpdir, monkeypatch):
    """Create a hook from the full JSON config and run a model with it.

    Asserts that training is not reported as ended for the output directory
    before the model run starts.
    """
    out_dir = tmpdir.join("test_hook_from_json_config_full")
    json_path = tmpdir.join("config.json")
    json_path.write(get_json_config_full(str(out_dir)))
    # The hook factory finds the config file through this env variable.
    monkeypatch.setenv(CONFIG_FILE_PATH_ENV_STR, str(json_path))

    configured_hook = Hook.create_from_json_file()
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=configured_hook)
def main():
    """Train an XGBoost model on CSV data with a debug hook attached.

    Reads header-less feature/label CSV files from the directories given by
    ``args.train`` and ``args.validation``, builds DMatrix objects, attaches
    them to a hook created from the JSON config file, trains the booster with
    the hook as a callback, and pickles the booster to
    ``$SM_MODEL_DIR/model.bin``.
    """
    args = parse_args()

    train_features_path = os.path.join(args.train, 'train_features.csv')
    train_labels_path = os.path.join(args.train, 'train_labels.csv')

    val_features_path = os.path.join(args.validation, 'val_features.csv')
    val_labels_path = os.path.join(args.validation, 'val_labels.csv')

    print('Loading training dataframes...')
    df_train_features = pd.read_csv(train_features_path, header=None)
    df_train_labels = pd.read_csv(train_labels_path, header=None)

    print('Loading validation dataframes...')
    df_val_features = pd.read_csv(val_features_path, header=None)
    df_val_labels = pd.read_csv(val_labels_path, header=None)

    X = df_train_features.values
    # Labels are flattened to 1-D before being passed as DMatrix labels.
    y = df_train_labels.values.reshape(-1)

    val_X = df_val_features.values
    val_y = df_val_labels.values.reshape(-1)

    print('Train features shape: {}'.format(X.shape))
    print('Train labels shape: {}'.format(y.shape))
    print('Validation features shape: {}'.format(val_X.shape))
    print('Validation labels shape: {}'.format(val_y.shape))

    dtrain = xgboost.DMatrix(X, label=y)
    dval = xgboost.DMatrix(val_X, label=val_y)

    # The hook is configured from JSON; the datasets are attached afterwards.
    hook = Hook.create_from_json_file()
    hook.train_data = dtrain
    hook.validation_data = dval

    watchlist = [(dtrain, "train"), (dval, "validation")]

    params = {
        "max_depth": args.max_depth,
        "eta": args.eta,
        "gamma": args.gamma,
        "min_child_weight": args.min_child_weight,
        "silent": args.silent,
        "objective": args.objective,
        "eval_metric": args.eval_metric
    }

    bst = xgboost.train(params=params,
                        dtrain=dtrain,
                        evals=watchlist,
                        num_boost_round=args.num_round,
                        callbacks=[hook])

    model_dir = os.environ.get('SM_MODEL_DIR')
    # Use a context manager so the model file is flushed and closed even if
    # pickling fails part-way through.
    with open(model_dir + '/model.bin', 'wb') as model_file:
        pkl.dump(bst, model_file)
def helper_xgboost_tests(collection, save_config):
    """Run the simple XGBoost model with a hook and verify the saved files.

    :param collection: Two-item sequence ``(name, regex)``; only the name is
                       used here.
    :param save_config: Save configuration handed to the hook and to
                        ``verify_files``.
    """
    coll_name, _ = collection

    # A timestamped run id gives every invocation its own trial directory.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S%f")
    run_id = "".join(("trial_", coll_name, "-", timestamp))
    trial_dir = os.path.join(SMDEBUG_XG_HOOK_TESTS_DIR, run_id)

    hook_kwargs = dict(
        out_dir=trial_dir,
        include_collections=[coll_name],
        save_config=save_config,
        export_tensorboard=True,
    )
    hook = XG_Hook(**hook_kwargs)

    simple_xg_model(hook)
    hook.close()

    verify_files(
        trial_dir,
        save_config,
        [
            "scalar/xg_num_steps",
            "scalar/xg_before_train",
            "scalar/xg_after_train",
        ],
    )
def helper_xgboost_tests(collection, save_config, with_timestamp):
    """Run the simple XGBoost model with a hook and verify its outputs.

    :param collection: Two-item sequence ``(name, regex)``; only the name is
                       used here.
    :param save_config: Save configuration handed to the hook and to
                        ``verify_files``.
    :param with_timestamp: Forwarded to ``simple_xg_model``; when truthy the
                           TF event files are checked as well.
    """
    coll_name, _ = collection

    # A timestamped run id gives every invocation its own trial directory.
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S%f")
    run_id = "".join(("trial_", coll_name, "-", timestamp))
    trial_dir = os.path.join(SMDEBUG_XG_HOOK_TESTS_DIR, run_id)

    hook_kwargs = dict(
        out_dir=trial_dir,
        include_collections=[coll_name],
        save_config=save_config,
        export_tensorboard=True,
    )
    hook = XG_Hook(**hook_kwargs)

    # The model run reports which scalars it saved.
    saved_scalars = simple_xg_model(hook, with_timestamp=with_timestamp)
    hook.close()

    verify_files(trial_dir, save_config, saved_scalars)
    if not with_timestamp:
        return
    check_tf_events(trial_dir, saved_scalars)
def test_hook_save_config_collections(tmpdir):
    """Check that per-collection save intervals are honoured.

    "metrics" is saved every 2 steps and "feature_importance" every 3; for
    each, every saved step except the last one must be a multiple of its
    interval.
    """
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(
        out_dir=out_dir,
        include_collections=["metrics", "feature_importance"],
    )

    # Override the save config of each collection individually.
    hook.get_collection("metrics").save_config = SaveConfig(save_interval=2)
    hook.get_collection("feature_importance").save_config = SaveConfig(
        save_interval=3)

    run_xgboost_model(hook=hook)
    trial = create_trial(out_dir)

    # The final recorded step is excluded from the interval check.
    rmse_steps = trial.tensor("train-rmse").steps()
    assert all(step % 2 == 0 for step in rmse_steps[:-1])

    importance_names = [
        name for name in trial.tensor_names()
        if name.startswith("feature_importance/")
    ]
    importance_steps = trial.tensor(importance_names[0]).steps()
    assert all(step % 3 == 0 for step in importance_steps[:-1])
def create_smdebug_hook(out_dir, train_data=None, validation_data=None, frequency=1, collections=None,):
    """Build a Hook that saves every ``frequency`` steps.

    :param out_dir: Passed to the hook as ``out_dir``.
    :param train_data: Passed to the hook as ``train_data``.
    :param validation_data: Passed to the hook as ``validation_data``.
    :param frequency: Used as the ``save_interval`` of the hook's SaveConfig.
    :param collections: Passed to the hook as ``include_collections``.
    :return: The constructed Hook.
    """
    return Hook(
        out_dir=out_dir,
        train_data=train_data,
        validation_data=validation_data,
        save_config=SaveConfig(save_interval=frequency),
        include_collections=collections,
    )
Example #11
0
def main():
    """Train an XGBoost model on CSV data with a debug hook attached.

    Every file matching ``*.*`` in the ``args.train`` and ``args.validation``
    directories is read with ``pd.read_csv`` and concatenated. The ``Target``
    column is the label and the remaining columns are features. The booster
    is trained with a hook created from the JSON config file and pickled to
    ``$SM_MODEL_DIR/model.bin``.
    """
    args = parse_args()
    train_files_path, validation_files_path = args.train, args.validation

    train_files_list = glob.glob(train_files_path + '/*.*')
    print(train_files_list)

    val_files_list = glob.glob(validation_files_path + '/*.*')
    print(val_files_list)

    print('Loading training data...')
    df_train = pd.concat(map(pd.read_csv, train_files_list))
    print('Loading validation data...')
    df_val = pd.concat(map(pd.read_csv, val_files_list))
    print('Data loading completed.')

    # Split each frame into the label column and the feature matrix.
    y = df_train.Target.values
    X = df_train.drop(['Target'], axis=1).values
    val_y = df_val.Target.values
    val_X = df_val.drop(['Target'], axis=1).values

    dtrain = xgboost.DMatrix(X, label=y)
    dval = xgboost.DMatrix(val_X, label=val_y)

    params = {
        "max_depth": args.max_depth,
        "eta": args.eta,
        "gamma": args.gamma,
        "min_child_weight": args.min_child_weight,
        "silent": args.silent,
        "objective": args.objective,
        "num_class": args.num_class
    }

    # The hook is configured from JSON; the datasets are attached afterwards.
    hook = Hook.create_from_json_file()
    hook.train_data = dtrain
    hook.validation_data = dval

    watchlist = [(dtrain, "train"), (dval, "validation")]

    bst = xgboost.train(params=params,
                        dtrain=dtrain,
                        evals=watchlist,
                        num_boost_round=args.num_round,
                        callbacks=[hook])

    model_dir = os.environ.get('SM_MODEL_DIR')
    # Use a context manager so the model file is flushed and closed even if
    # pickling fails part-way through.
    with open(model_dir + '/model.bin', 'wb') as model_file:
        pkl.dump(bst, model_file)
Example #12
0
def test_hook_tree_model(tmpdir):
    """Check that the "trees" collection has one tensor per dataframe column.

    A reference booster (trained with ``num_boost_round=0``) supplies the
    column names of ``trees_to_dataframe()``; each column must show up as a
    ``trees/<column>`` tensor in the trial.
    """
    np.random.seed(42)
    features = np.random.rand(5, 10)
    labels = np.random.randint(2, size=5)
    dtrain = xgboost.DMatrix(features, label=labels)
    reference_booster = xgboost.train(
        {"objective": "binary:logistic"}, dtrain, num_boost_round=0)
    tree_frame = reference_booster.trees_to_dataframe()

    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    run_xgboost_model(
        hook=Hook(out_dir=out_dir, include_collections=["trees"]))

    trial = create_trial(out_dir)
    tensor_names = trial.tensor_names()
    assert len(tensor_names) > 0
    assert "trees" in trial.collections()
    for column in tree_frame.columns:
        assert f"trees/{column}" in tensor_names
Example #13
0
def test_hook_save_all(tmpdir):
    """Check the collections and tensors saved when save_all=True.

    Only steps 0-3 are saved. The trial must expose the "all", "metrics" and
    "feature_importance" collections, and "all" must list every tensor.
    """
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    first_four_steps = SaveConfig(save_steps=[0, 1, 2, 3])

    save_all_hook = Hook(
        out_dir=out_dir, save_config=first_four_steps, save_all=True)
    run_xgboost_model(hook=save_all_hook)

    trial = create_trial(out_dir)
    collections = trial.collections()
    tensor_names = trial.tensor_names()

    assert len(tensor_names) > 0
    assert len(trial.steps()) == 4
    for expected_collection in ("all", "metrics", "feature_importance"):
        assert expected_collection in collections
    assert "train-rmse" in tensor_names
    for prefix in ("feature_importance/", "trees/"):
        assert any(name.startswith(prefix) for name in tensor_names)
    # The catch-all collection must reference every saved tensor.
    assert len(collections["all"].tensor_names) == len(tensor_names)
Example #14
0
def test_hook_feature_importance(tmpdir):
    """Check that every feature-importance type is saved by the hook.

    Tensors are expected under ``feature_importance/<type>/`` for the types
    weight, gain, cover, total_gain and total_cover.
    """
    np.random.seed(42)
    features = np.random.rand(10, 10)
    labels = np.random.randint(2, size=10)
    dtrain = xgboost.DMatrix(features, label=labels)

    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    importance_hook = Hook(
        out_dir=out_dir, include_collections=["feature_importance"])
    run_xgboost_model(hook=importance_hook)

    trial = create_trial(out_dir)
    tensor_names = trial.tensor_names()
    assert len(tensor_names) > 0
    assert "feature_importance" in trial.collections()
    assert any(
        name.startswith("feature_importance/") for name in tensor_names)

    importance_types = ("weight", "gain", "cover", "total_gain", "total_cover")
    for importance_type in importance_types:
        prefix = f"feature_importance/{importance_type}/"
        assert any(name.startswith(prefix) for name in tensor_names)
def add_debugging(callbacks, hyperparameters, train_dmatrix,
                  val_dmatrix=None, json_config_path=None):
    """Add a sagemaker debug hook to a list of callbacks.

    Hook creation is best-effort: if anything fails, the failure is logged at
    debug level and ``callbacks`` is left unchanged.

    :param callbacks: List of callback functions. The hook is appended to
                      this list in place on success.
    :param hyperparameters: Dict of hyperparamters.
                            Same as `params` in xgb.train(params, dtrain).
    :param train_dmatrix: Training data set.
    :param val_dmatrix: Validation data set.
    :param json_config_path: If specified, this json config will be used
                             instead of default config file.
    :return: None.
    """
    try:
        hook = Hook.hook_from_config(json_config_path)
        hook.hyperparameters = hyperparameters
        hook.train_data = train_dmatrix
        if val_dmatrix is not None:
            hook.validation_data = val_dmatrix
        callbacks.append(hook)
        logging.info("Debug hook created from config")
    except Exception as e:
        # Deliberately broad: a missing or invalid debug configuration must
        # not break the caller. The exception is passed as a lazy %-argument;
        # without a placeholder the record cannot be formatted and the
        # exception text never reaches the log.
        logging.debug("Failed to create debug hook: %s", e)
Example #16
0
def test_hook_params(tmpdir):
    """Check that hyperparameters handed to the hook are saved as tensors.

    Each entry of the params dict is expected under
    ``hyperparameters/<name>`` with its original value at step 0.
    """
    np.random.seed(42)
    # Draw order is kept so the seeded random stream is consumed identically.
    dtrain = xgboost.DMatrix(
        np.random.rand(5, 10), label=np.random.randint(2, size=5))
    dvalid = xgboost.DMatrix(
        np.random.rand(5, 10), label=np.random.randint(2, size=5))
    params = {"objective": "binary:logistic", "num_round": 20, "eta": 0.1}

    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    params_hook = Hook(
        out_dir=out_dir,
        include_collections=["hyperparameters"],
        hyperparameters=params,
    )
    run_xgboost_model(hook=params_hook)

    trial = create_trial(out_dir)
    assert len(trial.tensor_names()) > 0
    assert "hyperparameters" in trial.collections()

    objective = trial.tensor("hyperparameters/objective").value(0)
    assert objective == "binary:logistic"
    num_round = trial.tensor("hyperparameters/num_round").value(0)
    assert num_round == 20
    eta = trial.tensor("hyperparameters/eta").value(0)
    assert eta == 0.1
Example #17
0
def test_setting_mode(tmpdir):
    """set_mode accepts modes.GLOBAL and raises ValueError for the string "a"."""
    target_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    mode_hook = Hook(out_dir=target_dir, export_tensorboard=True)

    # A proper mode value must be accepted without raising.
    mode_hook.set_mode(modes.GLOBAL)

    # An arbitrary string is not a valid mode.
    with pytest.raises(ValueError):
        mode_hook.set_mode("a")
Example #18
0
def test_hook_tensorboard_dir_created(tmpdir):
    """With export_tensorboard=True, out_dir must contain a "tensorboard" entry."""
    target_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    run_xgboost_model(
        hook=Hook(out_dir=target_dir, export_tensorboard=True))

    entries = os.listdir(target_dir)
    assert "tensorboard" in entries
Example #19
0
def test_hook(tmpdir):
    """Run the XGBoost model with a hook that saves only steps 0-3.

    Before the run, training must not be reported as ended for the (fresh,
    uuid-named) output directory.
    """
    save_config = SaveConfig(save_steps=[0, 1, 2, 3])
    out_dir = os.path.join(tmpdir, str(uuid.uuid4()))
    hook = Hook(out_dir=out_dir, save_config=save_config)
    assert has_training_ended(out_dir) is False
    run_xgboost_model(hook=hook)