def plot_roc(result_df, data, qf=ps.StandardQF(0.5), levels=40, annotate=False):
    """Plot subgroups in ROC space on top of a contour map of ``qf``.

    The background is a filled contour plot of ``qf.evaluate`` sampled on a
    100 x 100 grid of (false positive rate, true positive rate) pairs in
    [0.01, 0.99]. Each row of ``result_df`` is drawn as a black dot at its
    own (false positive rate, true positive rate) position.

    Args:
        result_df: DataFrame with the columns ``'positives_dataset'``,
            ``'positives_sg'`` and ``'size_sg'``; one row per subgroup.
        data: The dataset; only ``len(data)`` is used.
        qf: Object whose ``evaluate`` is called as ``evaluate(instances_dataset,
            positives_dataset, size_sg, positives_sg)``.
        levels: Passed to ``plt.contourf`` as the ``levels`` argument.
        annotate: If true, label every dot with its ``result_df`` index.

    Returns:
        The matplotlib figure that was created.
    """
    instances_dataset = len(data)
    positives_dataset = np.max(result_df['positives_dataset'])
    negatives_dataset = instances_dataset - positives_dataset

    # Grid over ROC space: X is the false positive rate, Y the true positive rate.
    xlist = np.linspace(0.01, 0.99, 100)
    ylist = np.linspace(0.01, 0.99, 100)
    X, Y = np.meshgrid(xlist, ylist)

    # The ``np.float`` alias was removed in NumPy 1.24; the builtin ``float``
    # selects the same float64 output dtype.
    f = np.vectorize(
        partial(qf.evaluate, instances_dataset, positives_dataset),
        otypes=[float],
    )
    # Convert the rates back into absolute counts: (subgroup size, subgroup positives).
    Z = f(X * negatives_dataset + Y * positives_dataset, Y * positives_dataset)

    # Symmetric colour range so that a quality of zero maps to the centre of "bwr".
    max_val = np.max([np.max(Z), -np.min(Z)])

    fig, ax = plt.subplots()
    # ``plt.cm.get_cmap`` was removed in Matplotlib 3.9; ``plt.get_cmap`` is
    # available in old and new releases alike.
    cm = plt.get_cmap("bwr")
    # Kept as a pyplot call so the contour set stays the "current image" for
    # callers that add a colorbar afterwards; none is drawn here.
    plt.contourf(X, Y, Z, levels, cmap=cm, vmin=-max_val, vmax=max_val)

    for i, sg in result_df.iterrows():
        rel_positives_sg = sg['positives_sg'] / positives_dataset
        rel_negatives_sg = (sg['size_sg'] - sg['positives_sg']) / negatives_dataset
        # Marker only in the format string: 'ro' together with color='black'
        # defined the colour twice (black won) and triggered a warning.
        ax.plot(rel_negatives_sg, rel_positives_sg, 'o', color='black')
        if annotate:
            label_margin = 0.01
            ax.annotate(
                str(i),
                (rel_negatives_sg + label_margin, rel_positives_sg + label_margin),
            )

    plt.title('Discovered subgroups')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    return fig
def test_simple(self):
    """Check statistics, repeatability and score of ``self.ga_qf`` for A1 AND BA."""

    def fresh_conjunction():
        # A new (but equal) Conjunction object is built for every call.
        return ps.Conjunction([self.A1, self.BA])

    dummy_task = task_dummy(self.df, ps.BinaryTarget('columnC', 1))
    standard_qf = ps.StandardQF(0)
    standard_qf.calculate_constant_statistics(dummy_task)
    self.ga_qf.calculate_constant_statistics(dummy_task)

    ga_stat = self.ga_qf.calculate_statistics(fresh_conjunction(), self.df)
    expected_subgroup = ps.SimplePositivesQF.tpl(3, 2)
    self.assertEqual(ga_stat.subgroup_stats, expected_subgroup)
    expected_generalisation = ps.SimplePositivesQF.tpl(5, 3)
    self.assertEqual(ga_stat.generalisation_stats, expected_generalisation)

    # Ensure the cache works properly: asking again yields equal statistics.
    self.assertEqual(
        ga_stat,
        self.ga_qf.calculate_statistics(fresh_conjunction(), self.df),
    )

    # Two evaluations of the same description must agree with each other ...
    first_score = self.ga_qf.evaluate(fresh_conjunction(), self.df)
    second_score = self.ga_qf.evaluate(fresh_conjunction(), self.df)
    self.assertEqual(first_score, second_score)
    # ... and with the expected value.
    self.assertAlmostEqual(first_score, 0.06666666666666)
def setUpClass(cls):
    """Build a task on the credit data and store the SimpleDFS result on ``cls``.

    The task uses nominal selectors only (the ``'class'`` column is ignored),
    a result set size of 10, depth 5 and ``ps.StandardQF(1.0)``.
    """
    credit_data = get_credit_data()
    cls.task = ps.SubgroupDiscoveryTask(
        credit_data,
        ps.BinaryTarget('class', b'bad'),
        ps.create_nominal_selectors(credit_data, ignore=['class']),
        result_set_size=10,
        depth=5,
        qf=ps.StandardQF(1.0),
    )
    cls.result = ps.SimpleDFS().execute(cls.task)
def setUp(self):
    """Prepare ``self.result``, ``self.qualities`` and ``self.task``.

    ``self.result`` holds twelve conjunctions and ``self.qualities`` twelve
    numbers in the same order (presumably the reference values checked by
    tests outside this view). ``self.task`` searches the credit data with
    nominal and numeric selectors, depth 5 and ``ps.StandardQF(1.0)``.
    """
    selector = ps.EqualitySelector
    checking = selector("checking_status", b"<0")
    foreign_worker = selector("foreign_worker", b"yes")
    other_parties = selector("other_parties", b"none")
    savings = selector("savings_status", b"<100")
    job = selector("job", b"skilled")
    dependents = selector("num_dependents", 1.0)

    # One inner list per conjunction, in result order.
    selector_groups = [
        [checking, foreign_worker],
        [checking],
        [checking, other_parties, foreign_worker],
        [checking, other_parties],
        [checking, savings, foreign_worker],
        [checking, foreign_worker, dependents],
        [checking, savings],
        [checking, dependents],
        [checking, savings, other_parties, foreign_worker],
        [checking, job, foreign_worker],
        [checking, savings, other_parties],
        [checking, job],
    ]
    self.result = [ps.Conjunction(group) for group in selector_groups]

    self.qualities = [
        0.055299999999999995,
        0.05280000000000001,
        0.052300000000000006,
        0.05059999999999999,
        0.04959999999999999,
        0.04870000000000001,
        0.048299999999999996,
        0.0474,
        0.04660000000000001,
        0.04550000000000001,
        0.0452,
        0.044399999999999995,
    ]

    credit_data = get_credit_data()
    bad_credit = ps.BinaryTarget('class', b'bad')
    nominal_selectors = ps.create_nominal_selectors(credit_data, ignore=['class'])
    numeric_selectors = ps.create_numeric_selectors(credit_data, ignore=['class'])
    self.task = ps.SubgroupDiscoveryTask(
        credit_data,
        bad_credit,
        nominal_selectors + numeric_selectors,
        result_set_size=12,
        depth=5,
        qf=ps.StandardQF(1.0),
    )
def setUp(self):
    """Prepare ``self.result``, ``self.qualities`` and ``self.task``.

    ``self.result`` holds ten disjunctions that all start with the two
    ``checking_status`` selectors; ``self.qualities`` holds ten numbers in the
    same order (presumably the reference values checked by tests outside this
    view). ``self.task`` searches the credit data with nominal selectors,
    depth 3 and ``ps.StandardQF(1.0)``.
    """
    selector = ps.EqualitySelector
    checking_negative = selector("checking_status", b"<0")
    checking_low = selector("checking_status", b"0<=X<200")
    co_applicant = selector("other_parties", b"co applicant")
    purpose_other = selector("purpose", b'other')
    purpose_repairs = selector("purpose", b'repairs')
    purpose_business = selector("purpose", b'business')
    history_no_credits = selector("credit_history", b"no credits/all paid")
    history_all_paid = selector("credit_history", b"all paid")
    unemployed = selector("employment", b"unemployed")
    job_unskilled = selector("job", b"unemp/unskilled non res")
    bank_plan = selector("other_payment_plans", b"bank")

    # Every disjunction is the common prefix plus at most one more selector.
    common_prefix = [checking_negative, checking_low]
    additions = [
        [bank_plan],
        [history_no_credits],
        [],
        [purpose_other],
        [purpose_repairs],
        [unemployed],
        [co_applicant],
        [history_all_paid],
        [purpose_business],
        [job_unskilled],
    ]
    self.result = [ps.Disjunction(common_prefix + extra) for extra in additions]

    self.qualities = [
        0.0779,
        0.07740000000000002,
        0.0771,
        0.07680000000000001,
        0.07670000000000002,
        0.0767,
        0.07660000000000003,
        0.07650000000000003,
        0.07650000000000001,
        0.07600000000000001,
    ]

    credit_data = get_credit_data()
    bad_credit = ps.BinaryTarget('class', b'bad')
    nominal_selectors = ps.create_nominal_selectors(credit_data, ignore=['class'])
    self.task = ps.SubgroupDiscoveryTask(
        credit_data,
        bad_credit,
        nominal_selectors,
        result_set_size=10,
        depth=3,
        qf=ps.StandardQF(1.0),
    )
def setUp(self):
    """Prepare ``self.result``, ``self.qualities`` and ``self.task``.

    ``self.result`` holds ten conjunctions and ``self.qualities`` ten numbers
    in the same order (presumably the reference values checked by tests
    outside this view). ``self.task`` searches the credit data with nominal
    selectors, depth 5, ``ps.StandardQF(1.0)`` and a
    ``ps.MinSupportConstraint(200)`` constraint.
    """
    selector = ps.EqualitySelector
    checking = selector("checking_status", b"<0")
    foreign_worker = selector("foreign_worker", b"yes")
    other_parties = selector("other_parties", b"none")
    savings = selector("savings_status", b"<100")
    payment_plans = selector("other_payment_plans", b"none")

    # One inner list per conjunction, in result order.
    selector_groups = [
        [checking, foreign_worker],
        [checking],
        [checking, other_parties, foreign_worker],
        [checking, other_parties],
        [checking, savings, foreign_worker],
        [checking, savings],
        [checking, foreign_worker, payment_plans],
        [checking, payment_plans],
        [foreign_worker, savings],
        [foreign_worker, other_parties, savings],
    ]
    self.result = [ps.Conjunction(group) for group in selector_groups]

    self.qualities = [
        0.055299999999999995,
        0.05280000000000001,
        0.052300000000000006,
        0.05059999999999999,
        0.04959999999999999,
        0.048299999999999996,
        0.0426,
        0.04,
        0.03869999999999999,
        0.03750000000000001,
    ]

    credit_data = get_credit_data()
    bad_credit = ps.BinaryTarget('class', b'bad')
    nominal_selectors = ps.create_nominal_selectors(credit_data, ignore=['class'])
    self.task = ps.SubgroupDiscoveryTask(
        credit_data,
        bad_credit,
        nominal_selectors,
        result_set_size=10,
        depth=5,
        qf=ps.StandardQF(1.0),
        constraints=[ps.MinSupportConstraint(200)],
    )
import pandas as pd
import pysubgroup as ps

# Example: depth-first subgroup search on the titanic data, scored by a
# combination of StandardQF(1) and GeneralizationAware_StandardQF(1).
data = pd.read_table("../data/titanic.csv")
target = ps.BinaryTarget('Survived', 0)
search_space = ps.create_selectors(data, ignore=['Survived'])

quality_measures = [ps.StandardQF(1), ps.GeneralizationAware_StandardQF(1)]
combined_qf = ps.CombinedInterestingnessMeasure(quality_measures)
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=5,
    depth=2,
    qf=combined_qf,
)

# Optimistic estimates are switched off explicitly for this run.
result = ps.SimpleDFS().execute(task, use_optimistic_estimates=False)

result_frame = result.to_dataframe()
print(result_frame)
import pandas as pd
import pysubgroup as ps
from scipy.io import arff

# Example: run BeamSearch and SimpleDFS on the same credit-g task and print
# the quality and description of every subgroup found by each of them.
# NOTE(review): this script uses camelCase API names (createNominalSelectors,
# resultSetSize, beamWidth, subgroupDescription) -- confirm they match the
# installed pysubgroup version.
arff_records = arff.loadarff("../data/credit-g.arff")[0]
data = pd.DataFrame(arff_records)

target = ps.NominalTarget('class', b'bad')
searchSpace = ps.createNominalSelectors(data, ignore=['class'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchSpace,
    resultSetSize=10,
    depth=3,
    qf=ps.StandardQF(1.0),
)

result = ps.BeamSearch(beamWidth=10).execute(task)
for q, sg in result:
    print(f"{q!s}:\t{sg.subgroupDescription!s}")

# Separator between the two result listings.
print("******")

result = ps.SimpleDFS().execute(task)
for q, sg in result:
    print(f"{q!s}:\t{sg.subgroupDescription!s}")
from timeit import default_timer as timer

import pandas as pd
import pysubgroup as ps
from scipy.io import arff

# Example: time a ps.BSD() run on the credit-g data and print the subgroups.
arff_records = arff.loadarff("../data/credit-g.arff")[0]
data = pd.DataFrame(arff_records)

target = ps.NominalTarget('class', b'bad')
search_space = ps.create_nominal_selectors(data, ignore=['class'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=10,
    depth=5,
    qf=ps.StandardQF(0.5),
)

# An equivalent timed run with ps.BSD_Bitarray() used to sit here, commented out.

# Only the search itself is timed, not loading the data or building the task.
start = timer()
result = ps.BSD().execute(task)
end = timer()
elapsed = end - start
print("Time elapsed: ", elapsed)

for q, sg in result:
    print(f"{q!s}:\t{sg.subgroup_description!s}")
import pandas as pd
import pysubgroup as ps

# Example: depth-first subgroup search on the titanic data, scored by a
# combination of StandardQF(1) and GAStandardQF(1); prints quality and
# description of every subgroup in the result.
data = pd.read_table("../data/titanic.csv")
target = ps.BinaryTarget('Survived', 0)
search_space = ps.create_selectors(data, ignore=['Survived'])

quality_measures = [ps.StandardQF(1), ps.GAStandardQF(1)]
combined_qf = ps.CombinedInterestingnessMeasure(quality_measures)
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    search_space,
    result_set_size=5,
    depth=2,
    qf=combined_qf,
)

# Optimistic estimates are switched off explicitly for this run.
result = ps.SimpleDFS().execute(task, use_optimistic_estimates=False)

for q, sg in result:
    print(f"{q!s}:\t{sg.subgroup_description!s}")
def setUp(self):
    """Prepare ``self.result``, ``self.qualities`` and ``self.task``.

    ``self.result`` holds twelve conjunctions and ``self.qualities`` twelve
    numbers in the same order (presumably the reference values checked by
    tests outside this view). ``self.task`` searches the credit data with
    nominal and numeric selectors, depth 5 and ``ps.StandardQF(0.5)``.
    """
    selector = ps.EqualitySelector
    checking = selector("checking_status", b"<0")
    foreign_worker = selector("foreign_worker", b"yes")
    other_parties = selector("other_parties", b"none")
    savings = selector("savings_status", b"<100")
    job = selector("job", b"skilled")
    dependents = selector("num_dependents", 1.0)

    # One inner list per conjunction, in result order.
    selector_groups = [
        [checking, foreign_worker, job, other_parties, savings],
        # checking_status <0, foreign_worker yes, job skilled, savings_status <100
        [checking, foreign_worker, job, savings],
        # checking_status <0, foreign_worker yes, job skilled
        [checking, foreign_worker, job],
        # checking_status <0, job skilled, other_parties none, savings_status <100
        [checking, job, other_parties, savings],
        [checking, foreign_worker, job, other_parties],
        [checking, job, savings],
        [checking, foreign_worker, other_parties, savings],
        [checking, foreign_worker, other_parties],
        [checking, foreign_worker, savings],
        [checking, foreign_worker],
        [checking, foreign_worker, job, dependents, savings],
        [checking, job, other_parties],
    ]
    self.result = [ps.Conjunction(group) for group in selector_groups]

    self.qualities = [
        0.11457431093955019,
        0.113713540226172,
        0.11201325679119281,
        0.1117538749727658,
        0.11161046793076415,
        0.11145710640046322,
        0.11045259291161472,
        0.10929088624672183,
        0.10875519439407161,
        0.10866138825404954,
        0.10832735026213287,
        0.10813405094128754,
    ]

    credit_data = get_credit_data()
    bad_credit = ps.BinaryTarget('class', b'bad')
    nominal_selectors = ps.create_nominal_selectors(credit_data, ignore=['class'])
    numeric_selectors = ps.create_numeric_selectors(credit_data, ignore=['class'])
    self.task = ps.SubgroupDiscoveryTask(
        credit_data,
        bad_credit,
        nominal_selectors + numeric_selectors,
        result_set_size=12,
        depth=5,
        qf=ps.StandardQF(0.5),
    )
import pandas as pd
import pysubgroup as ps

# Example: beam search for subgroups of survivors in the titanic data;
# prints quality and description of every subgroup in the result.
# NOTE(review): this script uses camelCase API names (createSelectors,
# resultSetSize, subgroupDescription) -- confirm they match the installed
# pysubgroup version.
data = pd.read_csv("~/datasets/titanic.csv")
target = ps.NominalTarget('survived', True)
searchSpace = ps.createSelectors(data, ignore=['survived'])
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchSpace,
    resultSetSize=5,
    depth=2,
    qf=ps.StandardQF(1),
)

result = ps.BeamSearch().execute(task)
for q, sg in result:
    print(f"{q!s}:\t{sg.subgroupDescription!s}")