def test_learn_example_dataset_1(test_input):
    """Fit on the Toy-Cancer training set and count the learned estimators.

    ``test_input`` is the requested ``n_estimators``; after fitting, the
    model must hold exactly that many entries in ``estimators_``.
    """
    training_db, _ = load_toy_cancer()
    model = BoostedRDN(
        background=Background(modes=training_db.modes),
        target="cancer",
        n_estimators=test_input,
    )
    model.fit(training_db)
    assert len(model.estimators_) == test_input
def test_initialize_bad_n_estimators(test_input):
    """Fitting with an invalid ``n_estimators`` must raise ``ValueError``."""
    model = BoostedRDN(
        target="cancer",
        background=Background(),
        n_estimators=test_input,
    )
    # Only the call to ``fit`` is expected to raise, so only it is guarded.
    with pytest.raises(ValueError):
        model.fit(example_data.train)
def test_learn_example_dataset_1(test_input):
    """Fit on ``example_data.train`` and count the learned estimators.

    ``test_input`` is the requested ``n_estimators``; after fitting, the
    model must hold exactly that many entries in ``estimators_``.
    """
    training_db = example_data.train
    background = Background(
        modes=training_db.modes,
        use_std_logic_variables=True,
    )
    model = BoostedRDN(
        background=background,
        target="cancer",
        n_estimators=test_input,
    )
    model.fit(training_db)
    assert len(model.estimators_) == test_input
def test_predict_example_data(test_input):
    """Fit on Toy-Cancer with ``test_input`` estimators, then check ``predict``."""
    training_db, testing_db = load_toy_cancer()
    model = BoostedRDN(
        background=Background(modes=training_db.modes),
        target="cancer",
        n_estimators=test_input,
    )
    model.fit(training_db)

    expected_labels = np.array([1.0, 1.0, 1.0, 0.0, 0.0])
    assert_array_equal(model.predict(testing_db), expected_labels)
def test_predict_example_data(test_input):
    """Fit on ``example_data.train`` and check ``predict`` on ``example_data.test``."""
    background = Background(
        modes=example_data.train.modes,
        use_std_logic_variables=True,
    )
    model = BoostedRDN(
        background=background,
        target="cancer",
        n_estimators=test_input,
    )
    model.fit(example_data.train)

    expected_labels = np.array([1.0, 1.0, 1.0, 0.0, 0.0])
    assert_array_equal(model.predict(example_data.test), expected_labels)
def test_initialize_bad_neg_pos_ratio(test_input):
    """Fitting with an invalid ``neg_pos_ratio`` must raise ``ValueError``."""
    model = BoostedRDN(
        target="cancer",
        background=Background(),
        neg_pos_ratio=test_input,
    )
    training_db, _ = load_toy_cancer()
    # Only the call to ``fit`` is expected to raise, so only it is guarded.
    with pytest.raises(ValueError):
        model.fit(training_db)
def test_cannot_read_outside_length_of_dotfiles():
    """Invalid ``tree_index`` values given to ``export_digraph`` raise ``IndexError``."""
    training_db, _ = load_toy_cancer()
    model = BoostedRDN(
        target="cancer",
        background=Background(modes=training_db.modes),
    )
    model.fit(training_db)

    invalid_indexes = (-10, -5, -1, 10)
    for bad_index in invalid_indexes:
        with pytest.raises(IndexError):
            export_digraph(model, tree_index=bad_index)
def test_predict_proba_test_data():
    """Check ``predict_proba`` on the Toy-Cancer test set to two decimals."""
    training_db, testing_db = load_toy_cancer()
    model = BoostedRDN(
        background=Background(modes=training_db.modes),
        target="cancer",
        n_estimators=5,
    )
    model.fit(training_db)

    expected_probabilities = np.array([0.74, 0.74, 0.74, 0.25, 0.25])
    assert_array_almost_equal(
        model.predict_proba(testing_db),
        expected_probabilities,
        decimal=2,
    )
def test_predict_proba_test_data():
    """Check ``predict_proba`` on ``example_data.test`` to two decimals."""
    background = Background(
        modes=example_data.train.modes,
        use_std_logic_variables=True,
    )
    model = BoostedRDN(background=background, target="cancer", n_estimators=5)
    model.fit(example_data.train)

    expected_probabilities = np.array([0.74, 0.74, 0.74, 0.25, 0.25])
    assert_array_almost_equal(
        model.predict_proba(example_data.test),
        expected_probabilities,
        decimal=2,
    )
def test_feature_importances_toy_cancer():
    """Check the top entry of ``feature_importances_`` on Toy-Cancer.

    With ten estimators, the most common entry is expected to be
    ``("smokes", 10)``.
    """
    training_db, _ = load_toy_cancer()
    model = BoostedRDN(
        target="cancer",
        background=Background(modes=training_db.modes),
        n_estimators=10,
    )
    model.fit(training_db)

    importances = model.feature_importances_
    top_entry = importances.most_common(1)[0]
    assert top_entry == ("smokes", 10)
def test_serialize_BoostedRDN(tmpdir):
    """Round-trip a fitted model through JSON and predict with the reloaded copy."""
    json_path = tmpdir.join("ToyCancerRDN.json")
    training_db, testing_db = load_toy_cancer()

    fitted = BoostedRDN(
        background=Background(modes=training_db.modes),
        target="cancer",
        n_estimators=5,
    )
    fitted.fit(training_db)
    fitted.to_json(json_path)

    # Load the saved model into a brand-new instance and run inference with it.
    restored = BoostedRDN()
    restored.from_json(json_path)
    predicted = restored.predict(testing_db)

    assert len(restored.estimators_) == 5
    assert_array_equal(predicted, np.array([1.0, 1.0, 1.0, 0.0, 0.0]))
"male(+name).", "father(+name,+name).", "childof(+name,+name).", "siblingof(+name,+name)." ], number_of_clauses=8, use_prolog_variables=True, ) clf = BoostedRDN( background=bk, target="father", n_estimators=5, ) clf.fit(train_db) test_db = Database() test_db.pos = [ "father(elizabeth,mrbennet).", "father(jane,mrbennet).", "father(charlotte,mrlucas).", ] test_db.neg = [ "father(charlotte,mrsbennet).", "father(jane,mrlucas).", "father(mrsbennet,mrbennet).", "father(jane,elizabeth).", ]
import matplotlib.pyplot as plt

# Fit once with 20 estimators; the loop below re-runs ``predict_proba``
# after setting ``n_estimators`` to each value from 1 through 20.
bk = Background(
    modes=example_data.train.modes,
    use_std_logic_variables=True,
)
clf = BoostedRDN(
    background=bk,
    target='cancer',
    max_tree_depth=2,
    node_size=2,
    n_estimators=20,
)
clf.fit(example_data.train)

x = np.arange(1, 21)
y_pos, y_neg, thresholds = [], [], []

for n_trees in x:
    clf.set_params(n_estimators=n_trees)
    probs = clf.predict_proba(example_data.test)
    thresholds.append(clf.threshold_)

    # Mean predicted probability, split by the entries of ``clf.classes_``
    # (non-zero entries versus entries equal to zero).
    nonzero_entries = np.nonzero(clf.classes_)
    zero_entries = clf.classes_ == 0
    y_pos.append(np.mean(probs[nonzero_entries]))
    y_neg.append(np.mean(probs[zero_entries]))

thresholds = np.array(thresholds)
"actor(+person).", "movie(+movie, +person).", "movie(+movie, -person).", "movie(-movie, +person).", "female_gender(+person).", "genre(+person, +genre).", "genre(+person, #genre).", "genre(+person, -genre).", "genre(-person, +genre).", "workedunder(+person, +person).", "workedunder(+person, -person).", "workedunder(-person, +person)." ], ) clf = BoostedRDN( background=bk, target="workedunder", node_size=3, max_tree_depth=3, ) _start = time.perf_counter() clf.fit(db) _end = time.perf_counter() _difference = _end - _start print(_difference) _train_times.append(_difference) print("Mean runtime for `clf.fit()` and standard deviation:") print(np.mean(_train_times)) print(np.std(_train_times)) print("Raw numbers", _train_times)
def test_initialize_bad_background(test_input):
    """Fitting with an invalid ``background`` must raise ``ValueError``."""
    model = BoostedRDN(background=test_input, target="cancer")
    # Only the call to ``fit`` is expected to raise, so only it is guarded.
    with pytest.raises(ValueError):
        model.fit(example_data.train)
def test_initialize_bad_target(test_input):
    """Fitting with an invalid ``target`` must raise ``ValueError``."""
    model = BoostedRDN(target=test_input)
    # Only the call to ``fit`` is expected to raise, so only it is guarded.
    with pytest.raises(ValueError):
        model.fit(example_data.train)
# Declare modes for the four family predicates, then learn five trees
# for the ``father`` target.
bk = Background(
    modes=[
        "male(+name).",
        "father(+name,+name).",
        "childof(+name,+name).",
        "siblingof(+name,+name).",
    ],
    node_size=1,
    number_of_clauses=8,
)

clf = BoostedRDN(
    background=bk,
    target="father",
    n_estimators=5,
)
clf.fit(train)

# %%
# It's important to check whether we actually learn something useful.
# We'll visually inspect the relational regression trees to see what
# they learned.

from srlearn.plotting import plot_digraph
from srlearn.plotting import export_digraph

plot_digraph(export_digraph(clf, 0), format="html")

# %%
# There is some variance between runs, but the concept that the
# trees pick up on is roughly that "*A father has a child and is male.*"
def test_initialize_bad_target(test_input):
    """Fitting on Toy-Cancer with an invalid ``target`` must raise ``ValueError``."""
    model = BoostedRDN(target=test_input)
    training_db, _ = load_toy_cancer()
    # Only the call to ``fit`` is expected to raise, so only it is guarded.
    with pytest.raises(ValueError):
        model.fit(training_db)
def test_initialize_bad_background(test_input):
    """Fitting on Toy-Cancer with an invalid ``background`` must raise ``ValueError``."""
    model = BoostedRDN(background=test_input, target="cancer")
    training_db, _ = load_toy_cancer()
    # Only the call to ``fit`` is expected to raise, so only it is guarded.
    with pytest.raises(ValueError):
        model.fit(training_db)