def test_learn_example_dataset_1(test_input):
    """Fit on the toy cancer training data and count the learned estimators.

    ``test_input`` is passed as ``n_estimators``; after fitting, the length
    of ``estimators_`` must equal it.
    """
    training_db, _ = load_toy_cancer()
    model = BoostedRDN(
        background=Background(modes=training_db.modes),
        target="cancer",
        n_estimators=test_input,
    )
    model.fit(training_db)

    learned = len(model.estimators_)
    assert learned == test_input
def test_initialize_bad_n_estimators(test_input):
    """Fitting with an invalid ``n_estimators`` value must raise ``ValueError``."""
    model = BoostedRDN(
        target="cancer",
        background=Background(),
        n_estimators=test_input,
    )

    # The constructor call above is outside the ``raises`` block, so the
    # error is expected from ``fit``.
    with pytest.raises(ValueError):
        model.fit(example_data.train)
def test_learn_example_dataset_1(test_input):
    """Fit on ``example_data.train`` and count the learned estimators.

    ``test_input`` is passed as ``n_estimators``; after fitting, the length
    of ``estimators_`` must equal it.
    """
    model = BoostedRDN(
        background=Background(
            modes=example_data.train.modes,
            use_std_logic_variables=True,
        ),
        target="cancer",
        n_estimators=test_input,
    )
    model.fit(example_data.train)

    learned = len(model.estimators_)
    assert learned == test_input
def test_predict_example_data(test_input):
    """Fit on the toy cancer training data, then check ``predict`` on the test data.

    ``test_input`` is passed as ``n_estimators``.
    """
    training_db, testing_db = load_toy_cancer()
    model = BoostedRDN(
        background=Background(modes=training_db.modes),
        target="cancer",
        n_estimators=test_input,
    )
    model.fit(training_db)

    predicted = model.predict(testing_db)
    expected = np.array([1.0, 1.0, 1.0, 0.0, 0.0])
    assert_array_equal(predicted, expected)
def test_predict_example_data(test_input):
    """Fit on ``example_data.train``, then check ``predict`` on ``example_data.test``.

    ``test_input`` is passed as ``n_estimators``.
    """
    model = BoostedRDN(
        background=Background(
            modes=example_data.train.modes,
            use_std_logic_variables=True,
        ),
        target="cancer",
        n_estimators=test_input,
    )
    model.fit(example_data.train)

    predicted = model.predict(example_data.test)
    expected = np.array([1.0, 1.0, 1.0, 0.0, 0.0])
    assert_array_equal(predicted, expected)
def test_initialize_bad_neg_pos_ratio(test_input):
    """Fitting with an invalid ``neg_pos_ratio`` value must raise ``ValueError``."""
    model = BoostedRDN(
        target="cancer",
        background=Background(),
        neg_pos_ratio=test_input,
    )
    training_db, _ = load_toy_cancer()

    # Only ``fit`` sits inside the ``raises`` block.
    with pytest.raises(ValueError):
        model.fit(training_db)
def test_toy_cancer_predict_after_load(test_input):
    """Load a stored toy cancer JSON model and check its predictions.

    ``test_input`` fills the ``{0}`` placeholder in the JSON file name.
    """
    model = BoostedRDN()
    json_path = "srlearn/tests/regression_tests/json/toy_cancer_{0}.json".format(
        test_input)
    model.from_json(json_path)

    _, testing_db = load_toy_cancer()
    predicted = model.predict(testing_db)

    expected = np.array([1.0, 1.0, 1.0, 0.0, 0.0])
    assert_array_equal(predicted, expected)
def test_cannot_read_outside_length_of_dotfiles():
    """Exporting a digraph with an invalid ``tree_index`` must raise ``IndexError``."""
    training_db, _ = load_toy_cancer()
    model = BoostedRDN(
        target="cancer",
        background=Background(modes=training_db.modes),
    )
    model.fit(training_db)

    # Each index is checked in its own ``raises`` block so that every one
    # of them has to fail individually.
    for bad_index in (-10, -5, -1, 10):
        with pytest.raises(IndexError):
            export_digraph(model, tree_index=bad_index)
def test_predict_proba_test_data():
    """Check ``predict_proba`` on the toy cancer test data to two decimals."""
    training_db, testing_db = load_toy_cancer()
    model = BoostedRDN(
        background=Background(modes=training_db.modes),
        target="cancer",
        n_estimators=5,
    )
    model.fit(training_db)

    probabilities = model.predict_proba(testing_db)
    expected = np.array([0.74, 0.74, 0.74, 0.25, 0.25])
    assert_array_almost_equal(probabilities, expected, decimal=2)
def test_predict_proba_test_data():
    """Check ``predict_proba`` on ``example_data.test`` to two decimals."""
    model = BoostedRDN(
        background=Background(
            modes=example_data.train.modes,
            use_std_logic_variables=True,
        ),
        target="cancer",
        n_estimators=5,
    )
    model.fit(example_data.train)

    probabilities = model.predict_proba(example_data.test)
    expected = np.array([0.74, 0.74, 0.74, 0.25, 0.25])
    assert_array_almost_equal(probabilities, expected, decimal=2)
def test_feature_importances_toy_cancer():
    """Check the most common feature importance after fitting on Toy-Cancer.

    With ``n_estimators=10`` the top entry is expected to be
    ``("smokes", 10)``.
    """
    training_db, _ = load_toy_cancer()
    model = BoostedRDN(
        target="cancer",
        background=Background(modes=training_db.modes),
        n_estimators=10,
    )
    model.fit(training_db)

    importances = model.feature_importances_
    top_feature = importances.most_common(1)[0]
    assert top_feature == ("smokes", 10)
neg="datasets/imdb/train1/train1_neg.txt", facts="datasets/imdb/train1/train1_facts.txt", ) bk = Background(modes=[ "actor(+person).", "movie(+movie, +person).", "movie(+movie, -person).", "movie(-movie, +person).", "female_gender(+person).", "genre(+person, +genre).", "genre(+person, #genre).", "genre(+person, -genre).", "genre(-person, +genre).", "workedunder(+person, +person).", "workedunder(+person, -person).", "workedunder(-person, +person)." ], ) clf = BoostedRDN( background=bk, target="workedunder", node_size=3, max_tree_depth=3, ) _start = time.perf_counter() clf.fit(db) _end = time.perf_counter() _difference = _end - _start print(_difference) _train_times.append(_difference) print("Mean runtime for `clf.fit()` and standard deviation:") print(np.mean(_train_times))
from srlearn.rdn import BoostedRDN
from srlearn import Background

# Background knowledge: one mode per predicate (male, father, childof,
# siblingof), together with the node_size and number_of_clauses settings.
bk = Background(
    modes=[
        "male(+name).",
        "father(+name,+name).",
        "childof(+name,+name).",
        "siblingof(+name,+name)."
    ],
    node_size=1,
    number_of_clauses=8,
)

# Boosted model for the "father" target with n_estimators=5.
clf = BoostedRDN(
    background=bk,
    target="father",
    n_estimators=5,
)

# NOTE(review): ``train`` is not defined in this excerpt; presumably it is
# created earlier in the script -- confirm.
clf.fit(train)

# %%
# It's important to check whether we actually learn something useful.
# We'll visually inspect the relational regression trees to see what
# they learned.

from srlearn.plotting import plot_digraph
from srlearn.plotting import export_digraph

# Export the digraph at index 0 from the fitted model and plot it,
# requesting format="html".
plot_digraph(export_digraph(clf, 0), format="html")
def test_feature_importances_before_fit():
    """Reading ``feature_importances_`` on an unfitted model must raise ``ValueError``."""
    unfitted = BoostedRDN()

    with pytest.raises(ValueError):
        # The attribute access itself is what should raise; the value is
        # deliberately discarded.
        _ = unfitted.feature_importances_
def test_bad_shell_command():
    """``_call_shell_command`` must raise ``RuntimeError`` for the command ``git bat``."""
    model = BoostedRDN()
    failing_command = "git bat"

    with pytest.raises(RuntimeError):
        model._call_shell_command(failing_command)
def test_initialize_bad_target(test_input):
    """Fitting with an invalid ``target`` value must raise ``ValueError``."""
    model = BoostedRDN(target=test_input)
    training_db, _ = load_toy_cancer()

    # Only ``fit`` sits inside the ``raises`` block.
    with pytest.raises(ValueError):
        model.fit(training_db)
def test_initialize_bad_target(test_input):
    """Fitting on ``example_data.train`` with an invalid ``target`` must raise ``ValueError``."""
    model = BoostedRDN(target=test_input)

    # Only ``fit`` sits inside the ``raises`` block.
    with pytest.raises(ValueError):
        model.fit(example_data.train)
def test_initialize_rdn_trees(test_input):
    """The ``n_estimators`` passed to the constructor must be stored unchanged."""
    model = BoostedRDN(n_estimators=test_input)

    stored = model.n_estimators
    assert stored == test_input
def test_initialize_rdn_1():
    """Check the defaults of a ``BoostedRDN`` built without arguments."""
    model = BoostedRDN()

    # The expected default target is the string "None", not the None object.
    assert model.target == "None"
    assert model.n_estimators == 10
"courseprof(+course,-person).", "courseta(+course,-person).", "courseta(-course,+person).", "project(-proj,+person).", "project(+proj,-person).", "sameperson(-person,+person).", "faculty(+person).", "student(+person).", ], number_of_clauses=8, ) clf = BoostedRDN( background=bkg, target="faculty", max_tree_depth=3, node_size=3, n_estimators=10, ) clf.fit(train) # %% # The built-in ``feature_importances_`` attribute of a fit classifier is a # Counter of how many times a features appears across the trees: clf.feature_importances_ # %% # These should generally be looked at while looking at the trees, so we'll # plot the first tree here as well.
) bk = Background(modes=[ "courseprof(-Course, +Person).", "courseprof(+Course, -Person).", "courseta(+Course, -Person).", "courseta(-Course, +Person).", "faculty(+Person).", "project(-Proj, +Person).", "project(+Proj, -Person).", "sameperson(-Person, +Person).", ], ) clf = BoostedRDN( background=bk, target="faculty", node_size=2, max_tree_depth=3, ) _start = time.perf_counter() clf.fit(db) _end = time.perf_counter() _difference = _end - _start print(_difference) _train_times.append(_difference) print("Mean runtime for `clf.fit()` and standard deviation:") print(np.mean(_train_times))
def test_serialize_BoostedRDN(tmpdir):
    """Round-trip a fitted model through JSON and predict with the loaded copy."""
    json_file = tmpdir.join("ToyCancerRDN.json")
    training_db, testing_db = load_toy_cancer()

    # Fit and write the model out.
    original = BoostedRDN(
        background=Background(modes=training_db.modes),
        target="cancer",
        n_estimators=5,
    )
    original.fit(training_db)
    original.to_json(json_file)

    # Load into a fresh instance and run inference with it.
    restored = BoostedRDN()
    restored.from_json(json_file)
    predicted = restored.predict(testing_db)

    assert len(restored.estimators_) == 5
    expected = np.array([1.0, 1.0, 1.0, 0.0, 0.0])
    assert_array_equal(predicted, expected)
def test_initialize_bad_background(test_input):
    """Fitting with an invalid ``background`` value must raise ``ValueError``."""
    model = BoostedRDN(target="cancer", background=test_input)
    training_db, _ = load_toy_cancer()

    # Only ``fit`` sits inside the ``raises`` block.
    with pytest.raises(ValueError):
        model.fit(training_db)
from srlearn.rdn import BoostedRDN from srlearn import Background from srlearn import example_data import numpy as np import matplotlib.pyplot as plt bk = Background( modes=example_data.train.modes, use_std_logic_variables=True, ) clf = BoostedRDN( background=bk, target='cancer', max_tree_depth=2, node_size=2, n_estimators=20, ) clf.fit(example_data.train) x = np.arange(1, 21) y_pos = [] y_neg = [] thresholds = [] for n_trees in x: clf.set_params(n_estimators=n_trees) probs = clf.predict_proba(example_data.test)
] bk = Background( modes=[ "male(+name).", "father(+name,+name).", "childof(+name,+name).", "siblingof(+name,+name)." ], number_of_clauses=8, use_prolog_variables=True, ) clf = BoostedRDN( background=bk, target="father", n_estimators=5, ) clf.fit(train_db) test_db = Database() test_db.pos = [ "father(elizabeth,mrbennet).", "father(jane,mrbennet).", "father(charlotte,mrlucas).", ] test_db.neg = [ "father(charlotte,mrsbennet).",
def test_initialize_bad_background(test_input):
    """Fitting on ``example_data.train`` with an invalid ``background`` must raise ``ValueError``."""
    model = BoostedRDN(target="cancer", background=test_input)

    # Only ``fit`` sits inside the ``raises`` block.
    with pytest.raises(ValueError):
        model.fit(example_data.train)