def test_create_selectors_with_nan(self):
    """Check that ``create_selectors`` produces selectors for NaN values.

    The input frame has a column ``A`` holding only NaN and a column ``B``
    holding ``10`` followed by two NaN.  The test asserts that the value
    returned by ``ps.create_selectors`` contains
    ``EqualitySelector('A', nan)``, ``EqualitySelector('B', nan)`` and
    ``EqualitySelector('B', 10.)``.
    """
    columns = {
        'A': np.array([np.nan, np.nan, np.nan]),
        'B': np.array([10, np.nan, np.nan]),
    }
    df = pd.DataFrame.from_dict(columns)
    result = ps.create_selectors(df)

    # Build every expected selector up front, then check membership in order.
    expected_selectors = (
        ps.EqualitySelector('A', np.nan),
        ps.EqualitySelector('B', np.nan),
        ps.EqualitySelector('B', 10.),
    )
    for expected in expected_selectors:
        assert expected in result
import pandas as pd
import pysubgroup as ps

# Example script: run a depth-first subgroup search on the Titanic data,
# scoring candidates with a CombinedInterestingnessMeasure built from
# StandardQF(1) and GeneralizationAware_StandardQF(1), and print the result
# as a DataFrame.
data = pd.read_table("../data/titanic.csv")

# Binary target built from the column 'Survived' and the value 0.
target = ps.BinaryTarget('Survived', 0)
# 'Survived' is passed via ``ignore`` -- presumably so that no selectors are
# generated on the target column itself.
search_space = ps.create_selectors(data, ignore=['Survived'])

quality_function = ps.CombinedInterestingnessMeasure(
    [ps.StandardQF(1), ps.GeneralizationAware_StandardQF(1)]
)
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=5,
    depth=2,
    qf=quality_function,
)

# The search is run with optimistic estimates explicitly switched off.
search = ps.SimpleDFS()
result = search.execute(task, use_optimistic_estimates=False)
print(result.to_dataframe())
from timeit import default_timer as timer

import pysubgroup as ps
from pysubgroup.tests.DataSets import get_credit_data

# Example script: time a SimpleDFS subgroup search on the credit data set
# using a bidirectional ChiSquaredQF, then print every (quality, subgroup)
# pair of the result.
data = get_credit_data()
print("running")

# The target value is a bytes literal (b'bad'), not a str.
target = ps.BinaryTarget('class', b'bad')
search_space = ps.create_selectors(data, ignore=['class'])
chi_squared = ps.ChiSquaredQF(direction="bidirect")
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=10,
    depth=3,
    qf=chi_squared,
)

# Only the execute() call sits between the two timer readings; data loading
# and task construction are not part of the measured interval.
start = timer()
result = ps.SimpleDFS().execute(task)
end = timer()
elapsed = end - start
print("Time elapsed: ", elapsed)

# One line per result entry: "<quality>:<TAB><subgroup description>".
for q, sg in result:
    print(str(q), str(sg.subgroup_description), sep=":\t")

# print WRAccQF().evaluate_from_dataset(data, Subgroup(target, []))
from pysubgroup.tests.DataSets import get_titanic_data

# Example script: fit a classifier on three Titanic features, wrap its
# predicted probabilities in a PredictionTarget scored with roc_auc_score,
# and beam-search for subgroups using PredictionQFNumeric.
#
# NOTE(review): `ps`, `rf_classifier` and `roc_auc_score` are not defined in
# this snippet; they are assumed to be provided earlier in the file -- confirm.
data = get_titanic_data()

# Names of the columns whose dtype is float64 or int64.
is_numeric = data.dtypes.transform(lambda dtype: dtype in ["float64", "int64"])
select = data.columns[is_numeric]

# Index with a list rather than a set: pandas deprecated set indexers in 1.5
# and raises TypeError for them from 2.0 on.  A list also keeps the column
# order deterministic.
feature_columns = [column for column in select if column != "Survived"]
X = data[feature_columns]
y = data["Survived"]

# Keep only the rows that have no missing value in any numeric feature; the
# same mask is applied to the features and to the labels so they stay aligned.
complete_rows = X.isna().sum(axis=1) == 0
y = y.loc[complete_rows]
X = X.loc[complete_rows][["Age", "Fare", "Pclass"]]

# create simple model to test PredictionQF
RF = rf_classifier.fit(X[["Age", "Fare", "Pclass"]], y)
# Column 1 of predict_proba is used as the prediction score.
y_hat = RF.predict_proba(X)[:, 1]

target = ps.PredictionTarget(y.to_numpy(), y_hat, roc_auc_score)
searchspace = ps.create_selectors(X[["Age", "Fare", "Pclass"]], ignore=['Survived'])
task = ps.SubgroupDiscoveryTask(
    X,
    target,
    searchspace,
    result_set_size=5,
    depth=2,
    qf=ps.PredictionQFNumeric(a=0.5),
)
result = ps.BeamSearch().execute(task)

# The next two statements are bare expressions: their values are only shown
# in an interactive session and are discarded when run as a plain script.
result.to_dataframe()
roc_auc_score(y[X.Age >= 38.0], y_hat[X.Age >= 38.0])

############################################################
## Toy example using the test dataset to generate answers ##
import pysubgroup as ps import pandas as pd import pprint pp = pprint.PrettyPrinter(indent=4) data = pd.read_csv("../data/titanic.csv") search_space = ps.create_selectors(data, ignore="survived") dt = data.dtypes task = ps.SubgroupDiscoveryTask(data, ps.FITarget, search_space, result_set_size=10, depth=5, qf=ps.CountQF()) result = ps.SimpleDFS().execute(task) for (q, sg) in result: print(str(q) + ":\t" + str(sg.subgroup_description)) task = ps.SubgroupDiscoveryTask(data, ps.FITarget, search_space, result_set_size=10, depth=3, qf=ps.AreaQF()) result = ps.SimpleDFS().execute(task) for (q, sg) in result: