def setUp(self):
    """Prepare reference subgroups, reference qualities and the task.

    Sets ``self.result`` (ten conjunctions), ``self.qualities`` (ten floats
    listed in the same order) and ``self.task`` (binary target
    ``class == b'bad'`` on the credit data, nominal selectors only,
    ``StandardQF(1.0)``, depth 5, result set size 10).
    """
    checking = ps.EqualitySelector("checking_status", b"<0")
    foreign = ps.EqualitySelector("foreign_worker", b"yes")
    parties = ps.EqualitySelector("other_parties", b"none")
    savings = ps.EqualitySelector("savings_status", b"<100")
    job = ps.EqualitySelector("job", b"skilled")

    # One inner list per reference subgroup, in the same order as
    # ``self.qualities`` below.
    selector_groups = [
        [checking, foreign],
        [checking],
        [checking, parties, foreign],
        [checking, parties],
        [checking, savings, foreign],
        [checking, savings],
        [checking, savings, parties, foreign],
        [checking, job, foreign],
        [checking, savings, parties],
        [checking, job],
    ]
    self.result = [ps.Conjunction(group) for group in selector_groups]
    self.qualities = [
        0.055299999999999995,
        0.05280000000000001,
        0.052300000000000006,
        0.05059999999999999,
        0.04959999999999999,
        0.048299999999999996,
        0.04660000000000001,
        0.04550000000000001,
        0.0452,
        0.044399999999999995,
    ]

    credit = get_credit_data()
    bad_class = ps.BinaryTarget('class', b'bad')
    selectors = ps.create_nominal_selectors(credit, ignore=['class'])
    self.task = ps.SubgroupDiscoveryTask(
        credit,
        bad_class,
        selectors,
        result_set_size=10,
        depth=5,
        qf=ps.StandardQF(1.0),
    )
def setUp(self):
    """Prepare reference data and a PredictionTarget task on the credit data.

    Sets ``self.DFSresult`` / ``self.DFSqualities`` (``True`` followed by
    nine conjunctions, with ten quality values), ``self.result`` /
    ``self.qualities`` (ten conjunctions with ten quality values),
    the seeded random ``self.target_variables`` / ``self.target_estimates``
    and ``self.task``.
    """
    payment_none = ps.EqualitySelector("other_payment_plans", b"none")
    foreign_yes = ps.EqualitySelector("foreign_worker", b"yes")
    parties_none = ps.EqualitySelector("other_parties", b"none")
    housing_own = ps.EqualitySelector("housing", b'own')
    class_good = ps.EqualitySelector("class", b"good")

    dfs_groups = [
        [foreign_yes],
        [parties_none],
        [foreign_yes, parties_none],
        [payment_none],
        [foreign_yes, payment_none],
        [parties_none, payment_none],
        [housing_own],
        [class_good],
        [foreign_yes, parties_none, payment_none],
    ]
    # The DFS reference list carries a leading ``True`` entry ahead of the
    # conjunctions.
    self.DFSresult = [True] + [ps.Conjunction(group) for group in dfs_groups]
    self.DFSqualities = [
        500.4980179286455,
        483.3153195123844,
        459.2862838915471,
        444.60343785358896,
        398.25539855072464,
        384.0460358056267,
        362.090608537693,
        355.0749649843413,
        355.010575658835,
        349.8188702669149,
    ]

    groups = [
        [foreign_yes],
        [parties_none],
        [foreign_yes, parties_none],
        [payment_none],
        [foreign_yes, payment_none],
        [parties_none, payment_none],
        [housing_own],
        [class_good],
        [foreign_yes, parties_none, payment_none],
        [foreign_yes, housing_own],
    ]
    self.result = [ps.Conjunction(group) for group in groups]
    self.qualities = [
        483.3153195123844,
        459.2862838915471,
        444.60343785358896,
        398.25539855072464,
        384.0460358056267,
        362.090608537693,
        355.0749649843413,
        355.010575658835,
        349.8188702669149,
        342.20780439530444,
    ]

    # Seed first, then draw labels, then estimates: the draw order fixes
    # the generated values.
    np.random.seed(1111)
    self.target_variables = np.random.randint(low=0, high=2, size=1000)
    self.target_estimates = np.random.uniform(size=1000)

    credit = get_credit_data()
    prediction_target = ps.PredictionTarget(
        self.target_variables, self.target_estimates, roc_auc_score
    )
    nominal_selectors = ps.create_nominal_selectors(
        credit, ignore=['credit_amount']
    )
    # No numeric selectors are generated; the search space is nominal only.
    numeric_selectors = []
    selectors = nominal_selectors + numeric_selectors
    counting_qf = ps.CountCallsInterestingMeasure(
        ps.PredictionQFNumeric(1, False)
    )
    self.task = ps.SubgroupDiscoveryTask(
        credit,
        prediction_target,
        selectors,
        result_set_size=10,
        depth=5,
        qf=counting_qf,
    )
def setUpClass(cls):
    """Build the credit-data task and run SimpleDFS on it.

    Stores the task in ``cls.task`` and the SimpleDFS output in
    ``cls.result``.
    """
    credit = get_credit_data()
    bad_class = ps.BinaryTarget('class', b'bad')
    selectors = ps.create_nominal_selectors(credit, ignore=['class'])
    cls.task = ps.SubgroupDiscoveryTask(
        credit,
        bad_class,
        selectors,
        result_set_size=10,
        depth=5,
        qf=ps.StandardQF(1.0),
    )
    search = ps.SimpleDFS()
    cls.result = search.execute(cls.task)
def test_CountQf(self):
    """Check ``CountQF.evaluate`` for a selector, slices and an index array."""
    task = ps.SubgroupDiscoveryTask(self.data, ps.FITarget, None, None)
    count_qf = ps.CountQF()
    count_qf.calculate_constant_statistics(task)

    no_checking = ps.EqualitySelector('checking_status', b'no checking')
    print(self.data.columns)
    print(self.data.checking_status.value_counts())

    # Selector evaluated against the data frame.
    self.assertEqual(count_qf.evaluate(no_checking, self.data), 394)

    # Full slice: expected to match the number of rows.
    everything = count_qf.evaluate(slice(None))
    self.assertEqual(everything, len(self.data))

    # Partial slice of ten positions.
    first_ten = count_qf.evaluate(slice(0, 10))
    self.assertEqual(first_ten, 10)

    # Explicit integer index array with five entries.
    picked = count_qf.evaluate(np.array([1, 3, 5, 7, 11], dtype=int))
    self.assertEqual(picked, 5)
def setUp(self):
    """Prepare reference disjunctions, reference qualities and the task.

    Sets ``self.result`` (ten disjunctions), ``self.qualities`` (ten floats
    in the same order) and ``self.task`` (binary target ``class == b'bad'``
    on the credit data, ``StandardQF(1.0)``, depth 3, result set size 10).
    """
    checking_neg = ps.EqualitySelector("checking_status", b"<0")
    checking_low = ps.EqualitySelector("checking_status", b"0<=X<200")
    co_applicant = ps.EqualitySelector("other_parties", b"co applicant")
    purpose_other = ps.EqualitySelector("purpose", b'other')
    purpose_repairs = ps.EqualitySelector("purpose", b'repairs')
    purpose_business = ps.EqualitySelector("purpose", b'business')
    history_none = ps.EqualitySelector("credit_history", b"no credits/all paid")
    history_paid = ps.EqualitySelector("credit_history", b"all paid")
    unemployed = ps.EqualitySelector("employment", b"unemployed")
    job_unskilled = ps.EqualitySelector("job", b"unemp/unskilled non res")
    plans_bank = ps.EqualitySelector("other_payment_plans", b"bank")

    # Every reference disjunction starts with the two checking_status
    # selectors; only the optional third selector differs.
    third_selectors = [
        [plans_bank],
        [history_none],
        [],
        [purpose_other],
        [purpose_repairs],
        [unemployed],
        [co_applicant],
        [history_paid],
        [purpose_business],
        [job_unskilled],
    ]
    self.result = [
        ps.Disjunction([checking_neg, checking_low] + extra)
        for extra in third_selectors
    ]
    self.qualities = [
        0.0779,
        0.07740000000000002,
        0.0771,
        0.07680000000000001,
        0.07670000000000002,
        0.0767,
        0.07660000000000003,
        0.07650000000000003,
        0.07650000000000001,
        0.07600000000000001,
    ]

    credit = get_credit_data()
    bad_class = ps.BinaryTarget('class', b'bad')
    selectors = ps.create_nominal_selectors(credit, ignore=['class'])
    self.task = ps.SubgroupDiscoveryTask(
        credit,
        bad_class,
        selectors,
        result_set_size=10,
        depth=3,
        qf=ps.StandardQF(1.0),
    )
def setUp(self):
    """Prepare reference subgroups, qualities and a mixed-selector task.

    Sets ``self.result`` (twelve conjunctions), ``self.qualities`` (twelve
    floats in the same order) and ``self.task`` (binary target
    ``class == b'bad'`` on the credit data, nominal plus numeric selectors,
    ``StandardQF(0.5)``, depth 5, result set size 12).
    """
    checking = ps.EqualitySelector("checking_status", b"<0")
    foreign = ps.EqualitySelector("foreign_worker", b"yes")
    parties = ps.EqualitySelector("other_parties", b"none")
    savings = ps.EqualitySelector("savings_status", b"<100")
    job = ps.EqualitySelector("job", b"skilled")
    dependents = ps.EqualitySelector("num_dependents", 1.0)

    selector_groups = [
        [checking, foreign, job, parties, savings],  # AND job=='b'skilled'' AND other_parties=='b'none'' AND savings_status=='b'<100'
        # 0.113713540226172: checking_status=='b'<0'' AND foreign_worker=='b'yes'' AND job=='b'skilled'' AND savings_status=='b'<100''
        [checking, foreign, job, savings],
        [checking, foreign, job],  # checking_status=='b'<0'' AND foreign_worker=='b'yes'' AND job=='b'skilled''
        # checking_status=='b'<0'' AND job=='b'skilled'' AND other_parties=='b'none'' AND savings_status=='b'<100''
        [checking, job, parties, savings],
        [checking, foreign, job, parties],
        [checking, job, savings],
        [checking, foreign, parties, savings],
        [checking, foreign, parties],
        [checking, foreign, savings],
        [checking, foreign],
        [checking, foreign, job, dependents, savings],
        [checking, job, parties],
    ]
    self.result = [ps.Conjunction(group) for group in selector_groups]
    self.qualities = [
        0.11457431093955019,
        0.113713540226172,
        0.11201325679119281,
        0.1117538749727658,
        0.11161046793076415,
        0.11145710640046322,
        0.11045259291161472,
        0.10929088624672183,
        0.10875519439407161,
        0.10866138825404954,
        0.10832735026213287,
        0.10813405094128754,
    ]

    credit = get_credit_data()
    bad_class = ps.BinaryTarget('class', b'bad')
    nominal_selectors = ps.create_nominal_selectors(credit, ignore=['class'])
    numeric_selectors = ps.create_numeric_selectors(credit, ignore=['class'])
    selectors = nominal_selectors + numeric_selectors
    self.task = ps.SubgroupDiscoveryTask(
        credit,
        bad_class,
        selectors,
        result_set_size=12,
        depth=5,
        qf=ps.StandardQF(0.5),
    )
def setUp(self):
    """Prepare reference subgroups, qualities and a numeric-target task.

    Sets ``self.result`` (ten conjunctions), ``self.qualities`` (ten floats
    in the same order) and ``self.task`` (numeric target ``credit_amount``
    on the credit data, nominal selectors only, depth 5, result set size
    10, ``StandardQFNumeric(1, False, 'sum')`` wrapped in
    ``CountCallsInterestingMeasure``).
    """
    telephone = ps.EqualitySelector("own_telephone", b"yes")
    foreign = ps.EqualitySelector("foreign_worker", b"yes")
    parties = ps.EqualitySelector("other_parties", b"none")
    male_single = ps.EqualitySelector("personal_status", b'male single')
    job_high = ps.EqualitySelector("job", b'high qualif/self emp/mgmt')
    class_bad = ps.EqualitySelector("class", b"bad")

    selector_groups = [
        [telephone],
        [foreign, telephone],
        [parties, telephone],
        [foreign, telephone, male_single],
        [telephone, male_single],
        [foreign, parties, telephone],
        [job_high],
        [class_bad, telephone],
        [foreign, job_high],
        [foreign, parties, telephone, male_single],
    ]
    self.result = [ps.Conjunction(group) for group in selector_groups]
    self.qualities = [
        383476.7679999999,
        361710.05800000014,
        345352.9920000001,
        338205.08,
        336857.8220000001,
        323586.28200000006,
        320306.81600000005,
        300963.84599999996,
        299447.332,
        297422.98200000013,
    ]

    credit = get_credit_data()
    amount_target = ps.NumericTarget('credit_amount')
    nominal_selectors = ps.create_nominal_selectors(
        credit, ignore=['credit_amount']
    )
    # No numeric selectors are generated; the search space is nominal only.
    numeric_selectors = []
    selectors = nominal_selectors + numeric_selectors
    counting_qf = ps.CountCallsInterestingMeasure(
        ps.StandardQFNumeric(1, False, 'sum')
    )
    self.task = ps.SubgroupDiscoveryTask(
        credit,
        amount_target,
        selectors,
        result_set_size=10,
        depth=5,
        qf=counting_qf,
    )
def setUp(self):
    """Prepare reference subgroups, area qualities and the titanic task.

    Sets ``self.result`` (ten conjunctions), ``self.qualities`` (ten
    hard-coded integers), ``self.qualities2`` (covered row count times
    ``depth`` for each conjunction, recomputed from the data) and
    ``self.task`` (``FITarget``, ``AreaQF``, depth 2, result set size 10).
    The hard-coded and recomputed quality lists are asserted equal here.
    """
    cabin_nan = ps.EqualitySelector("Cabin", np.nan)
    embarked_s = ps.EqualitySelector("Embarked", 'S')
    embarked_c = ps.EqualitySelector("Embarked", 'C')
    male = ps.EqualitySelector("Sex", 'male')
    female = ps.EqualitySelector("Sex", 'female')

    selector_groups = [
        [cabin_nan, embarked_s],
        [cabin_nan, male],
        [embarked_s, male],
        [cabin_nan],
        [embarked_s],
        [male],
        [cabin_nan, female],
        [embarked_s, female],
        [female],
        [cabin_nan, embarked_c],
    ]
    self.result = [ps.Conjunction(group) for group in selector_groups]
    self.qualities = [178, 164, 146, 125, 110, 100, 86, 74, 56, 46]

    titanic = get_titanic_data()

    # Recompute each quality as (number of covered rows) * (depth).
    recomputed = []
    for conjunction in self.result:
        covered = np.count_nonzero(conjunction.covers(titanic))
        recomputed.append(covered * conjunction.depth)
    self.qualities2 = recomputed
    self.assertEqual(self.qualities, self.qualities2)

    selectors = ps.create_nominal_selectors(titanic)
    self.task = ps.SubgroupDiscoveryTask(
        titanic,
        ps.FITarget,
        selectors,
        result_set_size=10,
        depth=2,
        qf=ps.AreaQF(),
    )
def setUp(self):
    """Prepare reference subgroups, qualities and a constrained task.

    Sets ``self.result`` (ten conjunctions), ``self.qualities`` (ten floats
    in the same order) and ``self.task`` (binary target ``class == b'bad'``
    on the credit data, ``StandardQF(1.0)``, depth 5, result set size 10,
    with a ``MinSupportConstraint(200)``).
    """
    checking = ps.EqualitySelector("checking_status", b"<0")
    foreign = ps.EqualitySelector("foreign_worker", b"yes")
    parties = ps.EqualitySelector("other_parties", b"none")
    savings = ps.EqualitySelector("savings_status", b"<100")
    plans_none = ps.EqualitySelector("other_payment_plans", b"none")

    selector_groups = [
        [checking, foreign],
        [checking],
        [checking, parties, foreign],
        [checking, parties],
        [checking, savings, foreign],
        [checking, savings],
        [checking, foreign, plans_none],
        [checking, plans_none],
        [foreign, savings],
        [foreign, parties, savings],
    ]
    self.result = [ps.Conjunction(group) for group in selector_groups]
    self.qualities = [
        0.055299999999999995,
        0.05280000000000001,
        0.052300000000000006,
        0.05059999999999999,
        0.04959999999999999,
        0.048299999999999996,
        0.0426,
        0.04,
        0.03869999999999999,
        0.03750000000000001,
    ]

    credit = get_credit_data()
    bad_class = ps.BinaryTarget('class', b'bad')
    selectors = ps.create_nominal_selectors(credit, ignore=['class'])
    min_support = ps.MinSupportConstraint(200)
    self.task = ps.SubgroupDiscoveryTask(
        credit,
        bad_class,
        selectors,
        result_set_size=10,
        depth=5,
        qf=ps.StandardQF(1.0),
        constraints=[min_support],
    )
import pandas as pd
import pysubgroup as ps

# Example: SimpleDFS on the titanic data with a combined quality function.
data = pd.read_table("../data/titanic.csv")
target = ps.BinaryTarget('Survived', 0)
search_space = ps.create_selectors(data, ignore=['Survived'])

# The combined measure wraps a standard QF and its generalization-aware variant.
combined_qf = ps.CombinedInterestingnessMeasure(
    [ps.StandardQF(1), ps.GeneralizationAware_StandardQF(1)]
)
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=5,
    depth=2,
    qf=combined_qf,
)

result = ps.SimpleDFS().execute(task, use_optimistic_estimates=False)
print(result.to_dataframe())
from scipy.io import arff
import pysubgroup as ps
import pandas as pd

# Example: BeamSearch followed by SimpleDFS on the credit-g data.
# NOTE(review): uses the camelCase pysubgroup spellings
# (createNominalSelectors, resultSetSize, beamWidth, subgroupDescription);
# confirm they exist in the installed pysubgroup version.
arff_records = arff.loadarff("../data/credit-g.arff")[0]
data = pd.DataFrame(arff_records)

target = ps.NominalTarget('class', b'bad')
searchSpace = ps.createNominalSelectors(data, ignore=['class'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchSpace,
    resultSetSize=10,
    depth=3,
    qf=ps.StandardQF(1.0),
)

result = ps.BeamSearch(beamWidth=10).execute(task)
for q, sg in result:
    print(str(q), str(sg.subgroupDescription), sep=":\t")

print("******")

result = ps.SimpleDFS().execute(task)
for q, sg in result:
    print(str(q), str(sg.subgroupDescription), sep=":\t")

# print WRAccQF().evaluateFromDataset(data, Subgroup(target, []))
import pysubgroup as ps
import pandas as pd
from scipy.io import arff
from timeit import default_timer as timer

# Example: timed BestFirstSearch with a numeric target on the credit-g data.
# NOTE(review): uses the camelCase pysubgroup spellings
# (createNominalSelectors, resultSetSize, StandardQF_numeric,
# subgroupDescription); confirm they exist in the installed version.
arff_records = arff.loadarff("../data/credit-g.arff")[0]
data = pd.DataFrame(arff_records)

target = ps.NumericTarget('credit_amount')
searchSpace = ps.createNominalSelectors(data, ignore=['credit_amount'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchSpace,
    resultSetSize=10,
    depth=5,
    qf=ps.StandardQF_numeric(1, False),
)
print(searchSpace)

# Only the search itself is timed.
start = timer()
result = ps.BestFirstSearch().execute(task)
end = timer()
print(f"Time elapsed: {end - start}")

for q, sg in result:
    print(str(q), str(sg.subgroupDescription), sep=":\t")
from scipy.io import arff
import pysubgroup as ps
import pandas as pd
from timeit import default_timer as timer

# Example: timed BSD run on the credit-g data.
arff_records = arff.loadarff("../data/credit-g.arff")[0]
data = pd.DataFrame(arff_records)

target = ps.NominalTarget('class', b'bad')
search_space = ps.create_nominal_selectors(data, ignore=['class'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=10,
    depth=5,
    qf=ps.StandardQF(0.5),
)

#start = timer()
#result = ps.BSD_Bitarray().execute(task)
#end = timer()
#print("Time elapsed: ", (end - start))
#for (q, sg) in result:
#    print (str(q) + ":\t" + str(sg.subgroup_description))
# print WRAccQF().evaluate_from_dataset(data, Subgroup(target, []))

# Only the search itself is timed.
start = timer()
result = ps.BSD().execute(task)
end = timer()
print("Time elapsed: ", end - start)

for q, sg in result:
    print(str(q), str(sg.subgroup_description), sep=":\t")
from timeit import default_timer as timer
import pysubgroup as ps
from pysubgroup.tests.DataSets import get_credit_data

# Example: timed SimpleDFS with a bidirectional chi-squared quality function.
data = get_credit_data()
print("running")

target = ps.BinaryTarget('class', b'bad')
search_space = ps.create_selectors(data, ignore=['class'])
chi_squared = ps.ChiSquaredQF(direction="bidirect")
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=10,
    depth=3,
    qf=chi_squared,
)

# Only the search itself is timed.
start = timer()
result = ps.SimpleDFS().execute(task)
end = timer()
print("Time elapsed: ", end - start)

for q, sg in result:
    print(str(q), str(sg.subgroup_description), sep=":\t")

# print WRAccQF().evaluate_from_dataset(data, Subgroup(target, []))
import pysubgroup as ps
import pandas as pd

# Example: BeamSearch with a chi-squared quality function on the titanic data.
data = pd.read_table("../data/titanic.csv")
target = ps.NominalTarget('Survived', True)
searchspace = ps.create_selectors(data, ignore=['Survived'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=5,
    depth=2,
    qf=ps.ChiSquaredQF(),
)

result = ps.BeamSearch().execute(task)
for q, sg in result:
    print(str(q), str(sg.subgroup_description), sep=":\t")
# Feature frame: every selected column except the label.  The indexer is a
# list because pandas does not accept a set in DataFrame.__getitem__;
# dict.fromkeys drops duplicate names while keeping the order of `select`.
# `data`, `select` and `rf_classifier` are defined outside this snippet.
feature_columns = list(dict.fromkeys(c for c in select if c != "Survived"))
X = data[feature_columns]
y = data["Survived"]

# Keep only rows without any missing value among the selected columns; the
# mask is computed once and applied to both the labels and the features.
complete_rows = X.isna().sum(axis=1) == 0
y = y.loc[complete_rows]
X = X.loc[complete_rows][["Age", "Fare", "Pclass"]]

#create simple model to test PredictionQF
RF = rf_classifier.fit(X[["Age", "Fare", "Pclass"]], y)
y_hat = RF.predict_proba(X)[:, 1]  # second column of predict_proba

target = ps.PredictionTarget(y.to_numpy(), y_hat, roc_auc_score)
searchspace = ps.create_selectors(X[["Age", "Fare", "Pclass"]], ignore=['Survived'])
task = ps.SubgroupDiscoveryTask(
    X,
    target,
    searchspace,
    result_set_size=5,
    depth=2,
    qf=ps.PredictionQFNumeric(a=0.5),
)
result = ps.BeamSearch().execute(task)

# Bare expressions: their values are only displayed in an interactive
# session and are discarded when this runs as a script.
result.to_dataframe()
roc_auc_score(y[X.Age >= 38.0], y_hat[X.Age >= 38.0])

############################################################
## Toy example using the test dataset to generate answers ##
############################################################
from pysubgroup.tests.DataSets import get_credit_data
import pysubgroup as ps
import pandas as pd
from scipy.io import arff
from timeit import default_timer as timer

# Example: timed SimpleDFS with a numeric target on the credit-g data.
arff_records = arff.loadarff("../data/credit-g.arff")[0]
data = pd.DataFrame(arff_records)

target = ps.NumericTarget('credit_amount')
search_space = ps.create_nominal_selectors(data, ignore=['credit_amount'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=10,
    depth=3,
    qf=ps.StandardQFNumeric(1, False),
)
print(search_space)

# Only the search itself is timed.
start = timer()
result = ps.SimpleDFS().execute(task)
end = timer()
print(f"Time elapsed: {end - start}")

for q, sg in result:
    print(str(q), str(sg.subgroup_description), sep=":\t")
from pysubgroup import model_target
import time

# Scratch benchmark: GpGrowth (mode 'b_u') versus SimpleDFS with CountQF on
# the credit data.  `ps`, `get_credit_data` and `GpGrowth` are expected to be
# in scope already; they are not imported in this snippet.
data = get_credit_data()
#warnings.filterwarnings("error")
print(data.columns)

excluded_columns = ['duration', 'credit_amount']
searchSpace_Nominal = ps.create_nominal_selectors(data, ignore=excluded_columns)
searchSpace_Numeric = ps.create_numeric_selectors(data, ignore=excluded_columns)
searchSpace = searchSpace_Nominal + searchSpace_Numeric

target = ps.FITarget()
#QF=model_target.EMM_Likelihood(model_target.PolyRegression_ModelClass(x_name='duration', y_name='credit_amount'))
QF = ps.CountQF()
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchSpace,
    result_set_size=200,
    depth=4,
    qf=QF,
)

# Writes to a hard-coded absolute path.
GpGrowth(mode='b_u').to_file(task, 'E:/tmp/gp_credit.txt')

start_time = time.time()
gp = GpGrowth(mode='b_u').execute(task)
print("--- %s seconds ---" % (time.time() - start_time))
#gp = [(qual, sg) for qual, sg in gp if sg.depth <= task.depth]
gp = sorted(gp)

# The script stops here; the SimpleDFS timing below is never reached.
quit()

start_time = time.time()
dfs1 = ps.SimpleDFS().execute(task)
print("--- %s seconds ---" % (time.time() - start_time))
from scipy.io import arff
import pysubgroup as ps
import pandas as pd
from timeit import default_timer as timer

# Example: timed BSD run followed by a timed TID_SD run on the credit-g data.
# NOTE(review): uses the camelCase pysubgroup spellings
# (createNominalSelectors, resultSetSize, subgroupDescription); confirm they
# exist in the installed pysubgroup version.
arff_records = arff.loadarff("../data/credit-g.arff")[0]
data = pd.DataFrame(arff_records)

target = ps.NominalTarget('class', b'bad')
searchSpace = ps.createNominalSelectors(data, ignore=['class'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchSpace,
    resultSetSize=10,
    depth=5,
    qf=ps.ChiSquaredQF(),
)

# First algorithm: BSD.
start = timer()
result = ps.BSD().execute(task)
end = timer()
print("Time elapsed: ", end - start)
for q, sg in result:
    print(str(q), str(sg.subgroupDescription), sep=":\t")

print("******")

# Second algorithm: TID_SD on the same task.
start = timer()
result = ps.TID_SD().execute(task)
end = timer()
print("Time elapsed: ", end - start)
for q, sg in result:
    print(str(q), str(sg.subgroupDescription), sep=":\t")
import pysubgroup as ps import pandas as pd import pprint pp = pprint.PrettyPrinter(indent=4) data = pd.read_csv("~/datasets/titanic.csv") searchSpace = ps.createSelectors(data, ignore="survived") dt = data.dtypes task = ps.SubgroupDiscoveryTask(data, ps.FITarget, searchSpace, resultSetSize=10, depth=5, qf=ps.CountQF()) result = ps.SimpleDFS().execute(task) for (q, sg) in result: print(str(q) + ":\t" + str(sg.subgroupDescription)) task = ps.SubgroupDiscoveryTask(data, ps.FITarget, searchSpace, resultSetSize=10, depth=3, qf=ps.AreaQF()) result = ps.SimpleDFS().execute(task) for (q, sg) in result:
import pysubgroup as ps
import pandas as pd

# Example: SimpleDFS on the titanic data with a combined quality function.
# NOTE(review): uses the camelCase pysubgroup spellings (createSelectors,
# resultSetSize, GAStandardQF, useOptimisticEstimates, subgroupDescription);
# confirm they exist in the installed pysubgroup version.
data = pd.read_csv("~/datasets/titanic.csv")
target = ps.NominalTarget('survived', 0)
searchSpace = ps.createSelectors(data, ignore=['survived'])

combined_qf = ps.CombinedInterestingnessMeasure(
    [ps.StandardQF(1), ps.GAStandardQF(1)]
)
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchSpace,
    resultSetSize=5,
    depth=2,
    qf=combined_qf,
)

result = ps.SimpleDFS().execute(task, useOptimisticEstimates=False)
for q, sg in result:
    print(str(q), str(sg.subgroupDescription), sep=":\t")