Example #1
0
def test_predict_proba_test_data():
    """Check predict_proba() on the example test set to two decimals."""
    expected = np.array([0.74, 0.74, 0.74, 0.25, 0.25])

    background = Background(
        modes=example_data.train.modes,
        use_std_logic_variables=True,
    )
    model = BoostedRDN(
        background=background,
        target="cancer",
        n_estimators=5,
    )
    model.fit(example_data.train)

    predicted = model.predict_proba(example_data.test)
    assert_array_almost_equal(predicted, expected, decimal=2)
Example #2
0
def test_predict_proba_test_data():
    """Check predict_proba() on the toy cancer test split to two decimals."""
    train, test = load_toy_cancer()
    expected = np.array([0.74, 0.74, 0.74, 0.25, 0.25])

    model = BoostedRDN(
        background=Background(modes=train.modes),
        target="cancer",
        n_estimators=5,
    )
    model.fit(train)

    predicted = model.predict_proba(test)
    assert_array_almost_equal(predicted, expected, decimal=2)
Example #3
0
# Hand-built database of `father/2` examples and supporting facts, passed to
# `clf.predict_proba` at the end of this block.
test_db = Database()

# Atoms stored on `pos` (presumably the positive examples of the target
# relation -- confirm against the Database documentation).
test_db.pos = [
    "father(elizabeth,mrbennet).",
    "father(jane,mrbennet).",
    "father(charlotte,mrlucas).",
]

# Atoms stored on `neg` (presumably the negative examples -- confirm).
test_db.neg = [
    "father(charlotte,mrsbennet).",
    "father(jane,mrlucas).",
    "father(mrsbennet,mrbennet).",
    "father(jane,elizabeth).",
]

# Background facts over the `male/1`, `childof/2` and `siblingof/2`
# predicates. Note that `siblingof` is listed in both directions.
test_db.facts = [
    "male(mrbennet).",
    "male(mrlucas).",
    "male(darcy).",
    "childof(mrbennet,elizabeth).",
    "childof(mrsbennet,elizabeth).",
    "childof(mrbennet,jane).",
    "childof(mrsbennet,jane).",
    "childof(mrlucas,charlotte).",
    "childof(mrslucas,charlotte).",
    "siblingof(jane,elizabeth).",
    "siblingof(elizabeth,jane).",
]

# `clf` is not defined in this excerpt; it must already exist (and presumably
# be fitted) before this line runs.
print(clf.predict_proba(test_db))
    target='cancer',
    max_tree_depth=2,
    node_size=2,
    n_estimators=20,
)

# Fit once up front; the sweep below only changes `n_estimators` through
# `set_params` and re-predicts, it never calls `fit` again.
clf.fit(example_data.train)

x = np.arange(1, 21)
y_pos, y_neg, thresholds = [], [], []

for n_trees in x:
    clf.set_params(n_estimators=n_trees)
    probs = clf.predict_proba(example_data.test)

    # Record the threshold, then the mean probability over the entries whose
    # class label is nonzero and over those whose label equals 0.
    thresholds.append(clf.threshold_)
    y_pos.append(np.mean(probs[np.nonzero(clf.classes_)]))
    y_neg.append(np.mean(probs[clf.classes_ == 0]))

# Convert the accumulated lists to arrays for plotting.
thresholds, y_pos, y_neg = (
    np.array(values) for values in (thresholds, y_pos, y_neg)
)

plt.plot(x, y_pos, "b-", label="Mean Probability of positive examples")
plt.plot(x, y_neg, "r-", label="Mean Probability of negative examples")
plt.plot(x, thresholds, "k--", label="Margin")

plt.title("Class Probability vs. Number Trees")
plt.xlabel("Number of Trees")
plt.ylabel("Probability of belonging to Positive Class")
# Export the digraph for index 0 of `clf` and render it as HTML
# (presumably index 0 selects the first learned tree -- confirm).
plot_digraph(export_digraph(clf, 0), format="html")

# %%
# There is some variance between runs, but the concept that the
# trees pick up on is roughly that "*A father has a child and is male.*"

plot_digraph(export_digraph(clf, 1), format="html")

# %%
# Here the data is fairly complete, and the concept that "*A father has a
# child and is male*" seems sufficient for the purposes of this data.
# Let's apply our learned model to the test data, which includes facts
# about characters from Jane Austen's *Pride and Prejudice.*

# Score the test examples with `clf`.
predictions = clf.predict_proba(test)

# Header row and a 60-character rule, then one row per example.
print(
    "{:<35} {}".format("Predicate", "Probability of being True"),
    "\n",
    "-" * 60,
)
# Pairs `test.pos` followed by `test.neg` with the predictions in order;
# this assumes predict_proba returns them in that same order.
for predicate, prob in zip(test.pos + test.neg, predictions):
    print("{:<35} {:.2f}".format(predicate, prob))

# %%
# The confidence might be a little low, which is a good excuse to mention
# one of the hyperparameters. "Node Size," or ``node_size`` corresponds to
# the maximum number of predicates that can be used as a split in the
# dependency network. We set ``node_size=1`` above for demonstration, but the
# concept that seems to be learned: ``father(A, B) = [childof(B, A), male(B)]``
# is of size 2.
#
# We might be able to learn a better model by taking this new information
    target="cancer",
    max_tree_depth=2,
    node_size=2,
    n_estimators=20,
)

# Fit once up front; the sweep below only changes `n_estimators` through
# `set_params` and re-predicts, it never calls `fit` again.
clf.fit(train)

x = np.arange(1, 21)
y_pos, y_neg, thresholds = [], [], []

for n_trees in x:
    clf.set_params(n_estimators=n_trees)
    probs = clf.predict_proba(test)

    # Record the threshold, then the mean probability over the entries whose
    # class label is nonzero and over those whose label equals 0.
    thresholds.append(clf.threshold_)
    y_pos.append(np.mean(probs[np.nonzero(clf.classes_)]))
    y_neg.append(np.mean(probs[clf.classes_ == 0]))

# Convert the accumulated lists to arrays for plotting.
thresholds, y_pos, y_neg = (
    np.array(values) for values in (thresholds, y_pos, y_neg)
)

plt.plot(x, y_pos, "b-", label="Mean Probability of positive examples")
plt.plot(x, y_neg, "r-", label="Mean Probability of negative examples")
plt.plot(x, thresholds, "k--", label="Margin")

plt.title("Class Probability vs. Number Trees")
plt.xlabel("Number of Trees")
plt.ylabel("Probability of belonging to Positive Class")