def test_CountQf(self):
    """CountQF quality must equal the number of covered instances.

    Checks a concrete selector cover on the credit data, then slices and an
    explicit index array. (Removed leftover debug ``print`` calls that
    polluted the test output.)
    """
    task = ps.SubgroupDiscoveryTask(self.data, ps.FITarget, None, None)
    qf = ps.CountQF()
    qf.calculate_constant_statistics(task)

    # Cover size of a single equality selector on the credit data set.
    sel = ps.EqualitySelector('checking_status', b'no checking')
    size = qf.evaluate(sel, self.data)
    self.assertEqual(size, 394)

    # NOTE(review): the calls below omit the `data` argument used above —
    # presumably CountQF caches the data in calculate_constant_statistics;
    # confirm against the CountQF.evaluate signature.
    size = qf.evaluate(slice(None))
    self.assertEqual(size, len(self.data))

    size = qf.evaluate(slice(0, 10))
    self.assertEqual(size, 10)

    size = qf.evaluate(np.array([1, 3, 5, 7, 11], dtype=int))
    self.assertEqual(size, 5)
def setUp(self):
    """Build the expected FI result set for the titanic data and the task under test.

    The hard-coded ``self.qualities`` counts are cross-checked against the
    actual cover sizes so a data-set change fails loudly here, not in a test.
    """
    sel_cabin = ps.EqualitySelector("Cabin", np.nan)
    sel_embarked = ps.EqualitySelector("Embarked", 'S')
    sel_male = ps.EqualitySelector("Sex", 'male')
    sel_female = ps.EqualitySelector("Sex", 'female')

    # Expected subgroups, ordered to match the counts in self.qualities.
    self.result = [
        ps.Conjunction([]),
        ps.Conjunction([sel_cabin]),
        ps.Conjunction([sel_embarked]),
        ps.Conjunction([sel_male]),
        ps.Conjunction([sel_cabin, sel_embarked]),
        ps.Conjunction([sel_cabin, sel_male]),
        ps.Conjunction([sel_embarked, sel_male]),
        ps.Conjunction([sel_cabin, sel_embarked, sel_male]),
        ps.Conjunction([sel_female]),
        ps.Conjunction([sel_cabin, sel_female]),
        ps.Conjunction([sel_embarked, sel_female]),
    ]
    self.qualities = [156, 125, 110, 100, 89, 82, 73, 60, 56, 43, 37]

    data = get_titanic_data()
    # Sanity check: recompute each conjunction's cover size on the real data.
    self.qualities2 = [
        np.count_nonzero(conjunction.covers(data)) for conjunction in self.result
    ]
    self.assertEqual(self.qualities, self.qualities2)

    searchSpace = ps.create_nominal_selectors(data)
    self.task = ps.SubgroupDiscoveryTask(
        data, ps.FITarget, searchSpace, result_set_size=10, depth=5, qf=ps.CountQF())
def setUp(self):
    """Wrap a plain CountQF in a GeneralizationAwareQF and prepare the test frame."""
    base_qf = ps.CountQF()
    self.qf = base_qf
    self.ga_qf = ps.GeneralizationAwareQF(base_qf)
    self.prepare_df()
import pandas as pd
import pprint

pp = pprint.PrettyPrinter(indent=4)  # NOTE(review): unused in this section; kept in case later code uses it

# Demo: frequent-itemset style subgroup discovery on the titanic data set.
data = pd.read_csv("~/datasets/titanic.csv")

# Sibling code in this file uses the snake_case pysubgroup API
# (create_nominal_selectors, result_set_size) — use it here too instead of
# the old camelCase names (createSelectors, resultSetSize).
searchSpace = ps.create_selectors(data, ignore=["survived"])
dt = data.dtypes  # NOTE(review): unused here; kept for later sections

# Top-10 subgroups by instance count, up to depth 5.
task = ps.SubgroupDiscoveryTask(
    data, ps.FITarget, searchSpace, result_set_size=10, depth=5, qf=ps.CountQF())
result = ps.SimpleDFS().execute(task)
for (q, sg) in result:
    # NOTE(review): attribute name follows the old API — verify whether the
    # installed pysubgroup exposes subgroupDescription or subgroup_description.
    print(f"{q}:\t{sg.subgroupDescription}")

# Same search space, ranked by AreaQF at depth 3.
task = ps.SubgroupDiscoveryTask(
    data, ps.FITarget, searchSpace, result_set_size=10, depth=3, qf=ps.AreaQF())
result = ps.SimpleDFS().execute(task)
for (q, sg) in result:
    print(f"{q}\t{sg.subgroupDescription}\t{sg.count(data)}")
if __name__ == '__main__':
    import os
    import tempfile
    import time

    from pysubgroup.tests.DataSets import get_credit_data
    from pysubgroup import model_target

    data = get_credit_data()
    #warnings.filterwarnings("error")
    print(data.columns)

    # Nominal + numeric selectors over everything except the two attributes
    # reserved for the (commented-out) model target below.
    searchSpace_Nominal = ps.create_nominal_selectors(
        data, ignore=['duration', 'credit_amount'])
    searchSpace_Numeric = ps.create_numeric_selectors(
        data, ignore=['duration', 'credit_amount'])
    searchSpace = searchSpace_Nominal + searchSpace_Numeric

    target = ps.FITarget()
    #QF=model_target.EMM_Likelihood(model_target.PolyRegression_ModelClass(x_name='duration', y_name='credit_amount'))
    QF = ps.CountQF()
    task = ps.SubgroupDiscoveryTask(
        data, target, searchSpace, result_set_size=200, depth=4, qf=QF)

    # Write the pattern file to a portable location — the original hard-coded
    # 'E:/tmp/gp_credit.txt', which only exists on one Windows machine.
    out_path = os.path.join(tempfile.gettempdir(), 'gp_credit.txt')
    GpGrowth(mode='b_u').to_file(task, out_path)

    start_time = time.time()
    gp = GpGrowth(mode='b_u').execute(task)
    print("--- %s seconds ---" % (time.time() - start_time))

    #gp = [(qual, sg) for qual, sg in gp if sg.depth <= task.depth]
    # Sort by quality only: a plain sorted(gp) would fall back to comparing
    # the subgroup objects on quality ties, which can raise TypeError.
    gp = sorted(gp, key=lambda entry: entry[0])
    quit()