Ejemplo n.º 1
0
 def test_CountQf(self):
     """Check the values ``ps.CountQF`` returns for several subgroup forms.

     The quality function is evaluated once with an equality selector plus
     the data frame, and then with raw index objects only (a full slice, a
     bounded slice and an integer index array).
     """
     task = ps.SubgroupDiscoveryTask(self.data, ps.FITarget, None, None)
     count_qf = ps.CountQF()
     count_qf.calculate_constant_statistics(task)
     no_checking = ps.EqualitySelector('checking_status', b'no checking')

     # Diagnostic output: column names and the value distribution of the
     # attribute the selector is built on.
     print(self.data.columns)
     print(self.data.checking_status.value_counts())

     # Selector-based evaluation needs the data frame as second argument.
     self.assertEqual(count_qf.evaluate(no_checking, self.data), 394)

     # Index-based evaluation: (index object, expected count).
     index_cases = (
         (slice(None), len(self.data)),
         (slice(0, 10), 10),
         (np.array([1, 3, 5, 7, 11], dtype=int), 5),
     )
     for index, expected in index_cases:
         self.assertEqual(count_qf.evaluate(index), expected)
Ejemplo n.º 2
0
    def setUp(self):
        """Build the expected result list and the discovery task.

        ``self.result`` holds the expected conjunctions, ``self.qualities``
        the expected quality of each one (same order).  The qualities are
        cross-checked against the row counts obtained by applying every
        conjunction to the titanic data before the task is created.
        """
        cabin_nan = ps.EqualitySelector("Cabin", np.nan)
        embarked_s = ps.EqualitySelector("Embarked", 'S')
        sex_male = ps.EqualitySelector("Sex", 'male')
        sex_female = ps.EqualitySelector("Sex", 'female')

        # One entry per expected subgroup; the empty list is the
        # conjunction without any selector.
        selector_sets = [
            [],
            [cabin_nan],
            [embarked_s],
            [sex_male],
            [cabin_nan, embarked_s],
            [cabin_nan, sex_male],
            [embarked_s, sex_male],
            [cabin_nan, embarked_s, sex_male],
            [sex_female],
            [cabin_nan, sex_female],
            [embarked_s, sex_female],
        ]
        self.result = [ps.Conjunction(selectors) for selectors in selector_sets]

        self.qualities = [156, 125, 110, 100, 89, 82, 73, 60, 56, 43, 37]

        titanic = get_titanic_data()

        # Sanity check: the hard-coded qualities must equal the number of
        # rows each conjunction covers in the data set.
        self.qualities2 = [
            np.count_nonzero(conjunction.covers(titanic))
            for conjunction in self.result
        ]
        self.assertEqual(self.qualities, self.qualities2)

        nominal_selectors = ps.create_nominal_selectors(titanic)
        self.task = ps.SubgroupDiscoveryTask(
            titanic,
            ps.FITarget,
            nominal_selectors,
            result_set_size=10,
            depth=5,
            qf=ps.CountQF(),
        )
 def setUp(self):
     """Create the quality functions under test, then run ``prepare_df``.

     ``self.ga_qf`` wraps the very same ``CountQF`` instance that is stored
     in ``self.qf``.
     """
     count_qf = ps.CountQF()
     self.qf = count_qf
     self.ga_qf = ps.GeneralizationAwareQF(count_qf)
     self.prepare_df()
Ejemplo n.º 4
0
import pandas as pd

import pprint

# Script: run a depth-first subgroup search on the titanic CSV twice, first
# with CountQF (depth 5), then with AreaQF (depth 3), printing each result.
pp = pprint.PrettyPrinter(indent=4)

data = pd.read_csv("~/datasets/titanic.csv")
# The "survived" column is excluded when the selectors are generated.
searchSpace = ps.createSelectors(data, ignore="survived")
dt = data.dtypes

# --- Run 1: CountQF, up to 5 selectors per description -------------------
task = ps.SubgroupDiscoveryTask(
    data,
    ps.FITarget,
    searchSpace,
    resultSetSize=10,
    depth=5,
    qf=ps.CountQF(),
)
result = ps.SimpleDFS().execute(task)

# Each result entry is a (quality, subgroup) pair.
for q, sg in result:
    print(f"{q!s}:\t{sg.subgroupDescription!s}")

# --- Run 2: AreaQF, up to 3 selectors per description --------------------
task = ps.SubgroupDiscoveryTask(
    data,
    ps.FITarget,
    searchSpace,
    resultSetSize=10,
    depth=3,
    qf=ps.AreaQF(),
)
result = ps.SimpleDFS().execute(task)

# Additionally report sg.count(data) for every subgroup.
for q, sg in result:
    print("{}\t{}\t{}".format(q, sg.subgroupDescription, sg.count(data)))
Ejemplo n.º 5
0
if __name__ == '__main__':
    # Manual driver: run GpGrowth (mode 'b_u') with CountQF on the credit
    # data, dump the task to a file, then time one in-memory execution.
    import sys
    import time

    from pysubgroup.tests.DataSets import get_credit_data
    # Only referenced by the commented-out alternative quality function
    # below; the import is kept so that alternative can be re-enabled as is.
    from pysubgroup import model_target

    data = get_credit_data()
    #warnings.filterwarnings("error")
    print(data.columns)

    # 'duration' and 'credit_amount' are excluded from both selector sets.
    ignored_columns = ['duration', 'credit_amount']
    searchSpace_Nominal = ps.create_nominal_selectors(
        data, ignore=ignored_columns)
    searchSpace_Numeric = ps.create_numeric_selectors(
        data, ignore=ignored_columns)
    searchSpace = searchSpace_Nominal + searchSpace_Numeric

    target = ps.FITarget()
    #QF=model_target.EMM_Likelihood(model_target.PolyRegression_ModelClass(x_name='duration', y_name='credit_amount'))
    QF = ps.CountQF()
    task = ps.SubgroupDiscoveryTask(data,
                                    target,
                                    searchSpace,
                                    result_set_size=200,
                                    depth=4,
                                    qf=QF)

    # NOTE(review): hard-coded, machine-specific output path.
    GpGrowth(mode='b_u').to_file(task, 'E:/tmp/gp_credit.txt')

    # perf_counter() is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    gp = GpGrowth(mode='b_u').execute(task)
    elapsed = time.perf_counter() - start_time
    print(f"--- {elapsed} seconds ---")

    #gp = [(qual, sg) for qual, sg in gp if sg.depth <= task.depth]
    gp = sorted(gp)

    # sys.exit() instead of quit(): quit() is injected by the `site` module
    # for interactive use and is not guaranteed to exist (e.g. `python -S`).
    # Both raise SystemExit, so the script still stops here.
    sys.exit()