Esempio n. 1
0
 def test_create_selectors_with_nan(self):
     """Check that ``create_selectors`` copes with NaN-valued columns.

     Column 'A' holds only NaN and column 'B' holds one number plus NaN.
     The generated selectors must contain the NaN equality selector for
     both columns as well as the equality selector for ``B == 10.0``.
     """
     columns = {
         'A': np.array([np.nan] * 3),
         'B': np.array([10, np.nan, np.nan]),
     }
     frame = pd.DataFrame.from_dict(columns)

     selectors = ps.create_selectors(frame)

     # Build every expected selector first, then check membership one by one.
     expected_selectors = (
         ps.EqualitySelector('A', np.nan),
         ps.EqualitySelector('B', np.nan),
         ps.EqualitySelector('B', 10.),
     )
     for expected in expected_selectors:
         assert expected in selectors
Esempio n. 2
0
import pandas as pd
import pysubgroup as ps


# Subgroup discovery on the Titanic data with target ('Survived', 0),
# scored by a combination of two standard quality functions.
data = pd.read_table("../data/titanic.csv")
target = ps.BinaryTarget('Survived', 0)

# The target column is excluded from the candidate selectors.
search_space = ps.create_selectors(data, ignore=['Survived'])

# Combined measure built from the plain and the generalization-aware
# standard quality function, both constructed with the argument 1.
quality_measures = [ps.StandardQF(1), ps.GeneralizationAware_StandardQF(1)]
combined_qf = ps.CombinedInterestingnessMeasure(quality_measures)

task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=5,
    depth=2,
    qf=combined_qf,
)

# Depth-first search, explicitly run without optimistic estimates.
searcher = ps.SimpleDFS()
result = searcher.execute(task, use_optimistic_estimates=False)

print(result.to_dataframe())
Esempio n. 3
0
from timeit import default_timer as timer
import pysubgroup as ps
from pysubgroup.tests.DataSets import get_credit_data
data = get_credit_data()

print("running")

# Target: attribute 'class' with the bytes value b'bad'; the target column
# is kept out of the candidate selectors.
target = ps.BinaryTarget('class', b'bad')
search_space = ps.create_selectors(data, ignore=['class'])

quality_function = ps.ChiSquaredQF(direction="bidirect")
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=10,
    depth=3,
    qf=quality_function,
)

# Only the search (including construction of the algorithm object) is timed.
start = timer()
result = ps.SimpleDFS().execute(task)
end = timer()

elapsed = end - start
print("Time elapsed: ", elapsed)

# One line per result entry: quality, a tab, then the subgroup description.
for quality, subgroup in result:
    print(f"{quality!s}:\t{subgroup.subgroup_description!s}")

# print WRAccQF().evaluate_from_dataset(data, Subgroup(target, []))
Esempio n. 4
0
from pysubgroup.tests.DataSets import get_titanic_data
data = get_titanic_data()

# Names of the float64/int64 columns, kept as an ordered list.
select = [
    column
    for column, dtype in data.dtypes.items()
    if dtype in ("float64", "int64")
]

# Index with a list rather than a set: pandas rejects a set as a column
# indexer (deprecated in 1.5, TypeError since 2.0), and a set would not give
# a deterministic column order.
X = data[[column for column in select if column != "Survived"]]
y = data["Survived"]

# Keep only the rows that have no missing value in any numeric feature
# column; the mask is computed once and applied to labels and features alike.
complete_rows = X.notna().all(axis=1)
y = y.loc[complete_rows]
X = X.loc[complete_rows, ["Age", "Fare", "Pclass"]]

# Create a simple model to test PredictionQF.
# NOTE(review): rf_classifier, roc_auc_score and ps are not defined in the
# visible part of this script; they have to be provided before this point.
RF = rf_classifier.fit(X, y)
# Second column of predict_proba is used as the prediction score.
y_hat = RF.predict_proba(X)[:, 1]

target = ps.PredictionTarget(y.to_numpy(), y_hat, roc_auc_score)

# X already contains only "Age", "Fare" and "Pclass", so no re-selection
# of columns is needed here.
searchspace = ps.create_selectors(X, ignore=['Survived'])
task = ps.SubgroupDiscoveryTask(
    X,
    target,
    searchspace,
    result_set_size=5,
    depth=2,
    qf=ps.PredictionQFNumeric(a=0.5))

result = ps.BeamSearch().execute(task)

# The two expressions below discard their values when run as a script;
# they only display something in an interactive session or notebook.
result.to_dataframe()
older = X.Age >= 38.0
roc_auc_score(y[older], y_hat[older])

############################################################
## Toy example using the test dataset to generate answers ##
Esempio n. 5
0
import pysubgroup as ps
import pandas as pd

import pprint

pp = pprint.PrettyPrinter(indent=4)

data = pd.read_csv("../data/titanic.csv")
# `ignore` is given as a list of column names.  A bare string would be
# treated as a character sequence, so any membership check against it
# becomes a substring test instead of a column-name comparison.
# NOTE(review): the name is lowercase here; confirm it matches the column
# name in the CSV, and confirm the file is comma-separated (read_csv
# default) rather than tab-separated.
search_space = ps.create_selectors(data, ignore=["survived"])
dt = data.dtypes

# First run: CountQF, result set of 10, depth 5.
# NOTE(review): the target is passed as the class object `ps.FITarget`, not
# as an instance; confirm this is what the installed pysubgroup expects.
task = ps.SubgroupDiscoveryTask(data,
                                ps.FITarget,
                                search_space,
                                result_set_size=10,
                                depth=5,
                                qf=ps.CountQF())
result = ps.SimpleDFS().execute(task)

# One line per result entry: quality, a tab, then the subgroup description.
for (q, sg) in result:
    print(f"{q!s}:\t{sg.subgroup_description!s}")

# Second run on the same search space: AreaQF, result set of 10, depth 3.
task = ps.SubgroupDiscoveryTask(data,
                                ps.FITarget,
                                search_space,
                                result_set_size=10,
                                depth=3,
                                qf=ps.AreaQF())
result = ps.SimpleDFS().execute(task)

for (q, sg) in result: