def test_simple(self):
    """Check the generalization-aware statistics, their caching, and the final score.

    Uses a StandardQF(0) base measure on the fixture data frame; the subgroup
    under test is A1 AND BA against the binary target columnC == 1.
    """
    task = task_dummy(self.df, ps.BinaryTarget('columnC', 1))
    qf = ps.StandardQF(0)
    qf.calculate_constant_statistics(task)
    self.ga_qf.calculate_constant_statistics(task)
    #print(qf.calculate_statistics(self.A1, self.df))
    #print(qf.calculate_statistics(self.BA, self.df))
    #print(qf.calculate_statistics(ps.Conjunction([self.A1, self.BA]), self.df))
    #print(qf.calculate_statistics(slice(None), self.df))
    ga_stat = self.ga_qf.calculate_statistics(ps.Conjunction([self.A1, self.BA]), self.df)
    # Statistics of the subgroup itself (size 3, 2 positives) and of its best
    # generalization (size 5, 3 positives).
    self.assertEqual(ga_stat.subgroup_stats, ps.SimplePositivesQF.tpl(3, 2))
    self.assertEqual(ga_stat.generalisation_stats, ps.SimplePositivesQF.tpl(5, 3))
    # Ensure cache works properly: an equal (freshly constructed) description
    # must yield an equal statistics object.
    self.assertEqual(ga_stat, self.ga_qf.calculate_statistics(ps.Conjunction([self.A1, self.BA]), self.df))
    ga_score = self.ga_qf.evaluate(ps.Conjunction([self.A1, self.BA]), self.df)
    ga_score2 = self.ga_qf.evaluate(ps.Conjunction([self.A1, self.BA]), self.df)
    # Repeated evaluation of an equal description must return the same score.
    self.assertEqual(ga_score, ga_score2)
    self.assertAlmostEqual(ga_score, 0.06666666666666)
def test_DNF(self):
    """Exercise the equivalent construction paths of ``ps.DNF`` and its row coverage."""
    sel_a1 = ps.EqualitySelector("A1", 1)
    sel_a2 = ps.EqualitySelector("A2", 1, "AA")
    sel_b1 = ps.EqualitySelector("B1", 1)
    sel_b2 = ps.EqualitySelector("B2", "1")

    # Appending a list of selectors with append_or equals passing them to the constructor.
    dnf_appended = ps.DNF()
    dnf_appended.append_or([sel_a1, sel_a2])
    dnf_constructed = ps.DNF([sel_a1, sel_a2])
    self.assertTrue(dnf_appended == dnf_constructed)

    # Three equivalent ways to express the single conjunction A1 AND A2.
    dnf_from_conj = ps.DNF(ps.Conjunction([sel_a1, sel_a2]))
    dnf_and_list = ps.DNF()
    dnf_and_list.append_and([sel_a1, sel_a2])
    dnf_and_single = ps.DNF()
    dnf_and_single.append_and(sel_a1)
    dnf_and_single.append_and(sel_a2)
    self.assertTrue(dnf_from_conj == dnf_and_list)
    self.assertTrue(dnf_and_list == dnf_and_single)

    # A conjunction over the B selectors, and a genuine two-term disjunction.
    dnf_b_conj = ps.DNF([])
    dnf_b_conj.append_and([sel_b1, sel_b2])
    dnf_mixed = ps.DNF([])
    dnf_mixed.append_and([sel_a1, sel_a2])
    dnf_mixed.append_or(ps.Conjunction([sel_b1, sel_b2]))

    self.df = pd.DataFrame.from_dict({
        "A1": [1, 1, 1, 2, 2, 2, 2, 0, 0, 0],  #pylint: disable=attribute-defined-outside-init
        "A2": [0, 1, 1, 1, 2, 2, 2, 0, 0, 0],
        "B1": [0, 0, 0, 0, 1, 1, 1, 0, 1, 1],
        "B2": ["0", "0", "0", "0", "1", "1", "2", "0", "0", "1"]})

    # Verify the boolean cover vector each description produces on the frame.
    self.check_dataframe_query(dnf_appended, [1, 1, 1, 1, 0, 0, 0, 0, 0, 0])
    self.check_dataframe_query(dnf_from_conj, [0, 1, 1, 0, 0, 0, 0, 0, 0, 0])
    self.check_dataframe_query(dnf_b_conj, [0, 0, 0, 0, 1, 1, 0, 0, 0, 1])
    self.check_dataframe_query(dnf_mixed, [0, 1, 1, 0, 1, 1, 0, 0, 0, 1])
def setUp(self):
    """Prepare the expected top-10 result set and the discovery task on the credit data."""
    # Selectors used by the expected results (attribute values are bytes in this dataset).
    NS_checking = ps.EqualitySelector("checking_status", b"<0")
    NS_foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
    NS_other_parties = ps.EqualitySelector("other_parties", b"none")
    NS_savings_status = ps.EqualitySelector("savings_status", b"<100")
    NS_job = ps.EqualitySelector("job", b"skilled")
    # Expected subgroup descriptions, ordered by descending quality.
    self.result = [ps.Conjunction([NS_checking, NS_foreign_worker]),
                   ps.Conjunction([NS_checking]),
                   ps.Conjunction([NS_checking, NS_other_parties, NS_foreign_worker]),
                   ps.Conjunction([NS_checking, NS_other_parties]),
                   ps.Conjunction([NS_checking, NS_savings_status, NS_foreign_worker]),
                   ps.Conjunction([NS_checking, NS_savings_status]),
                   ps.Conjunction([NS_checking, NS_savings_status, NS_other_parties, NS_foreign_worker]),
                   ps.Conjunction([NS_checking, NS_job, NS_foreign_worker]),
                   ps.Conjunction([NS_checking, NS_savings_status, NS_other_parties]),
                   ps.Conjunction([NS_checking, NS_job]),
                   ]
    # Expected quality values, aligned index-by-index with self.result.
    self.qualities = [0.055299999999999995,
                      0.05280000000000001,
                      0.052300000000000006,
                      0.05059999999999999,
                      0.04959999999999999,
                      0.048299999999999996,
                      0.04660000000000001,
                      0.04550000000000001,
                      0.0452,
                      0.044399999999999995]
    data = get_credit_data()
    target = ps.BinaryTarget('class', b'bad')
    # Search over all nominal selectors except the target attribute itself.
    searchSpace = ps.create_nominal_selectors(data, ignore=['class'])
    self.task = ps.SubgroupDiscoveryTask(data, target, searchSpace, result_set_size=10, depth=5, qf=ps.StandardQF(1.0))
def test_equality_expressions(self):
    """Structurally equal compound descriptions must compare equal and share a hash."""
    eq_a1 = ps.EqualitySelector("A", 1)
    eq_a2 = ps.EqualitySelector("A", 2, "AA")
    eq_b1 = ps.EqualitySelector("B", 1)

    # Disjunctions built from the same selectors are equal and hash-equal.
    disjunction = ps.Disjunction([eq_a1, eq_a2])
    disjunction_twin = ps.Disjunction([eq_a1, eq_a2])
    self.assertTrue(disjunction == disjunction_twin)
    self.assertTrue(hash(disjunction) == hash(disjunction_twin))

    # Appending the missing selector makes the twin equal to the full disjunction.
    disjunction_full = ps.Disjunction([eq_a1, eq_a2, eq_b1])
    disjunction_twin.append_or(eq_b1)
    self.assertTrue(disjunction_full == disjunction_twin)
    self.assertTrue(hash(disjunction_full) == hash(disjunction_twin))

    # The same invariants hold for conjunctions.
    conjunction = ps.Conjunction([eq_a1, eq_a2])
    conjunction_twin = ps.Conjunction([eq_a1, eq_a2])
    self.assertTrue(conjunction == conjunction_twin)
    self.assertTrue(hash(conjunction) == hash(conjunction_twin))

    conjunction_full = ps.Conjunction([eq_a1, eq_a2, eq_b1])
    conjunction_twin.append_and(eq_b1)
    self.assertTrue(conjunction_full == conjunction_twin)
    self.assertTrue(hash(conjunction_full) == hash(conjunction_twin))

    # A conjunction never equals a disjunction over the same selectors.
    self.assertFalse(conjunction == disjunction)
    self.assertFalse(hash(conjunction) == hash(disjunction))
def test_CountTarget2(self):
    """The GA score must equal the plain score minus the empty-subgroup baseline."""
    frame = self.df
    self.ga_qf.calculate_constant_statistics(task_dummy(frame, None))
    ga_score = self.ga_qf.evaluate(ps.Conjunction([self.A1, self.BA]), frame)
    plain_score = self.qf.evaluate(ps.Conjunction([self.A1, self.BA]), frame)
    baseline_score = self.qf.evaluate(ps.Conjunction([]), frame)
    self.assertEqual(ga_score, plain_score - baseline_score)
def test_CountTarget1(self):
    """GA FI score equals the plain score minus the empty-subgroup baseline, and repeats stably."""
    frame = self.df
    target = ps.FITarget()
    self.ga_qf.calculate_constant_statistics(frame, target)

    ga_score = self.ga_qf.evaluate(ps.Conjunction([self.A1]), target, frame)
    plain_score = self.qf.evaluate(ps.Conjunction([self.A1]), target, frame)
    baseline_score = self.qf.evaluate(ps.Conjunction([]), target, frame)
    self.assertEqual(ga_score, plain_score - baseline_score)

    # Evaluating an equal description again must reproduce the same score.
    repeated_score = self.ga_qf.evaluate(ps.Conjunction([self.A1]), target, frame)
    self.assertEqual(repeated_score, ga_score)
def execute(self, task):
    """Exhaustively evaluate every conjunction of up to ``task.depth`` selectors.

    Enumerates all selector combinations from ``task.search_space``, scores
    each candidate with the task's quality function, and keeps the best ones.

    Parameters
    ----------
    task : ps.SubgroupDiscoveryTask
        Provides data, search space, depth, result-set size and quality function.

    Returns
    -------
    ps.SubgroupDiscoveryResult
        The result set, sorted by descending quality.
    """
    task.qf.calculate_constant_statistics(task)
    result = []
    # Lazily chain together all selector combinations of length 1..depth.
    all_selectors = chain.from_iterable(
        combinations(task.search_space, r) for r in range(1, task.depth + 1))
    if self.show_progress:
        try:
            from tqdm import tqdm
            from math import comb  # exact binomial coefficient; returns 0 when k > n

            # Total number of candidates, so the progress bar has a length.
            # math.comb replaces the previous hand-rolled factorial ratio and
            # avoids computing huge intermediate factorials.
            total = sum(comb(len(task.search_space), k)
                        for k in range(1, task.depth + 1))
            all_selectors = tqdm(all_selectors, total=total)
        except ImportError:
            # tqdm is optional; fall back silently to no progress reporting.
            pass
    for selectors in all_selectors:
        sg = ps.Conjunction(selectors)
        statistics = task.qf.calculate_statistics(sg, task.data)
        quality = task.qf.evaluate(sg, statistics)
        ps.add_if_required(result, sg, quality, task)
    result.sort(key=lambda x: x[0], reverse=True)
    return ps.SubgroupDiscoveryResult(result, task)
def search_internal(self, task, prefix, modification_set, result, bitset):
    """One recursive DFS step for numeric targets.

    Evaluates the subgroup described by ``prefix`` (rows covered given by the
    boolean ``bitset``), prunes the branch via an optimistic estimate, and
    recurses into each refinement taken from ``modification_set``.

    Returns the (mutated) ``result`` list.
    """
    self.num_calls += 1
    sg_size = bitset.sum()
    # An empty cover cannot contribute; cut the branch immediately.
    if sg_size == 0:
        return result
    target_values_sg = self.target_values[bitset]
    # Prefix sums / prefix means over the covered target values
    # (NOTE(review): presumably self.target_values is pre-sorted so that each
    # prefix is the best-case sub-cover — confirm in the class setup).
    target_values_cs = np.cumsum(target_values_sg)
    sizes = np.arange(1, len(target_values_cs) + 1)
    mean_values_cs = target_values_cs / sizes
    # Evaluate the quality for every prefix size at once.
    tpl = DFSNumeric.tpl(sizes, mean_values_cs)
    qualities = self.evaluate(None, tpl)
    optimistic_estimate = np.max(qualities)
    # Prune: no refinement of this prefix can beat the current result set.
    if optimistic_estimate <= ps.minimum_required_quality(result, task):
        return result
    sg = ps.Conjunction(copy.copy(prefix))
    # The last entry corresponds to the full cover of the current subgroup.
    quality = qualities[-1]
    ps.add_if_required(result, sg, quality, task)
    if len(prefix) < task.depth:
        new_modification_set = copy.copy(modification_set)
        for sel in modification_set:
            # Extend the prefix in place; the cover shrinks monotonically.
            prefix.append(sel)
            new_bitset = bitset & self.bitsets[sel]
            # Drop sel from the refinement set to avoid duplicate conjunctions.
            new_modification_set.pop(0)
            self.search_internal(task, prefix, new_modification_set, result, new_bitset)
            # remove the sel again
            prefix.pop(-1)
    return result
def search_internal(self, task, prefix, modification_set, result, use_optimistic_estimates):
    """Depth-first search over conjunctions built by extending ``prefix``.

    Scores the current prefix as a subgroup, optionally prunes the branch via
    the quality function's optimistic estimate, and recurses into every
    refinement taken from ``modification_set``. Returns the ``result`` list.
    """
    subgroup = ps.Conjunction(copy.copy(prefix))
    statistics = task.qf.calculate_statistics(subgroup, task.data)

    # Optimistic-estimate pruning only applies to bounded measures and only
    # pays off while the description can still be refined further.
    can_prune = (use_optimistic_estimates
                 and len(prefix) < task.depth
                 and isinstance(task.qf, ps.BoundedInterestingnessMeasure))
    if can_prune:
        estimate = task.qf.optimistic_estimate(subgroup, statistics)
        if not estimate > ps.minimum_required_quality(result, task):
            return result

    ps.add_if_required(result, subgroup, task.qf.evaluate(subgroup, statistics), task)

    if len(prefix) < task.depth:
        remaining = copy.copy(modification_set)
        for selector in modification_set:
            prefix.append(selector)
            # Drop the selector from the refinement set to avoid duplicates.
            remaining.pop(0)
            self.search_internal(task, prefix, remaining, result, use_optimistic_estimates)
            prefix.pop(-1)  # undo the extension before trying the next selector
    return result
def execute(self, task):
    """Best-first search: expand descriptions in order of their optimistic estimate."""
    task.qf.calculate_constant_statistics(task)
    operator = ps.StaticSpecializationOperator(task.search_space)
    result = []
    # Min-heap keyed on the negated optimistic estimate, so the most promising
    # description is popped first.
    queue = [(float("-inf"), ps.Conjunction([]))]
    while queue:
        neg_estimate, old_description = heappop(queue)
        # Once the best remaining estimate cannot beat the current worst
        # result, nothing on the queue can either.
        if not -neg_estimate > ps.minimum_required_quality(result, task):
            break
        for candidate in operator.refinements(old_description):
            statistics = task.qf.calculate_statistics(candidate, task.data)
            ps.add_if_required(result, candidate, task.qf.evaluate(candidate, statistics), task)
            estimate = task.qf.optimistic_estimate(candidate, statistics)
            # compute refinements and fill the queue: only candidates that can
            # still be refined and might beat the current threshold are queued.
            if len(candidate) < task.depth and estimate >= ps.minimum_required_quality(result, task):
                heappush(queue, (-estimate, candidate))
    result.sort(key=lambda x: x[0], reverse=True)
    return ps.SubgroupDiscoveryResult(result, task)
def execute(self, task):
    """Run beam search for the given discovery task.

    Maintains a beam of the best subgroups found so far and repeatedly
    extends every not-yet-expanded beam member by one selector, up to
    ``task.depth`` rounds. Returns a ps.SubgroupDiscoveryResult.
    """
    # adapt beam width to the result set size if desired
    if self.beam_width_adaptive:
        self.beam_width = task.result_set_size
    # check if beam size is to small for result set
    if self.beam_width < task.result_set_size:
        raise RuntimeError('Beam width in the beam search algorithm is smaller than the result set size!')
    task.qf.calculate_constant_statistics(task)
    # init: the beam starts with the empty description covering all rows.
    beam = [(0, ps.Conjunction([]), task.qf.calculate_statistics(slice(None), task.data))]
    last_beam = None
    depth = 0
    # Iterate until the beam stops changing or the maximum depth is reached.
    while beam != last_beam and depth < task.depth:
        last_beam = beam.copy()
        for (_, last_sg, _) in last_beam:
            # 'visited' is stashed on the description object itself so that a
            # description kept across rounds is not expanded twice.
            if not getattr(last_sg, 'visited', False):
                setattr(last_sg, 'visited', True)
                for sel in task.search_space:
                    # create a clone
                    new_selectors = list(last_sg.selectors)
                    if sel not in new_selectors:
                        new_selectors.append(sel)
                        sg = ps.Conjunction(new_selectors)
                        statistics = task.qf.calculate_statistics(sg, task.data)
                        quality = task.qf.evaluate(sg, statistics)
                        # May mutate `beam` while we iterate its copy; duplicates
                        # are filtered by check_for_duplicates.
                        ps.add_if_required(beam, sg, quality, task, check_for_duplicates=True, statistics=statistics)
        depth += 1
    # TODO make sure there is no bug here
    result = beam[:task.result_set_size]
    result.sort(key=lambda x: x[0], reverse=True)
    return ps.SubgroupDiscoveryResult(result, task)
def get_max_generalization_mean(data, subgroup, weighting_attribute=None):
    """Return the largest target mean over all generalizations of ``subgroup``.

    Every subset of the subgroup's selectors (the powerset, including the
    empty description) is evaluated on ``data`` and the maximum mean returned.
    """
    all_generalizations = ps.powerset(subgroup.subgroup_description.selectors)
    best_mean = 0
    for selector_subset in all_generalizations:
        candidate = ps.Subgroup(subgroup.target, ps.Conjunction(list(selector_subset)))
        # Index 3 of the base statistics tuple holds the (possibly weighted) mean.
        candidate_mean = candidate.get_base_statistics(data, weighting_attribute)[3]
        best_mean = max(best_mean, candidate_mean)
    return best_mean
def get_stats_and_previous_stats(self, subgroup, data):
    """Return the subgroup's statistics paired with the best statistics among
    its direct (one-selector-removed) generalizations."""
    stats_subgroup = self.qf.calculate_statistics(subgroup, data)
    best_generalization_stats = self.stats0
    selectors = subgroup.selectors
    if len(selectors) > 0:
        # Compute the statistics of every generalization that drops exactly
        # one selector, keeping the running maximum.
        for selector_subset in combinations(selectors, len(selectors) - 1):
            generalization = ps.Conjunction(list(selector_subset))
            stats_sg, stats_prev = self.calculate_statistics(generalization, data)
            best_generalization_stats = self.get_max(best_generalization_stats, stats_sg, stats_prev)
    return (stats_subgroup, best_generalization_stats)
def get_qual_and_previous_qual(self, subgroup, target, data):
    """Return the subgroup's quality paired with the best quality among its
    direct (one-selector-removed) generalizations."""
    q_subgroup = self.qf.evaluate(subgroup, target, data)
    best_generalization_q = 0
    selectors = subgroup.selectors
    if len(selectors) > 0:
        # Evaluate every generalization that drops exactly one selector,
        # keeping the running maximum over both returned quality values.
        for selector_subset in combinations(selectors, len(selectors) - 1):
            generalization = ps.Conjunction(list(selector_subset))
            q_sg, q_prev = self.calculate_statistics(generalization, target, data)
            best_generalization_q = max(best_generalization_q, q_sg, q_prev)
    return (q_subgroup, best_generalization_q)
def setUp(self):
    """Prepare expected results and an AreaQF discovery task on the titanic data."""
    NS_cabin = ps.EqualitySelector("Cabin", np.nan)
    NS_embarked = ps.EqualitySelector("Embarked", 'S')
    NS_embarked2 = ps.EqualitySelector("Embarked", 'C')
    NS_male = ps.EqualitySelector("Sex", 'male')
    NS_female = ps.EqualitySelector("Sex", 'female')
    #NS_other_parties = ps.EqualitySelector("other_parties", b"none")
    #NS_savings_status = ps.EqualitySelector("savings_status", b"<100")
    #NS_job = ps.EqualitySelector("job", b"skilled")
    # Expected subgroup descriptions, ordered by descending area quality.
    self.result = [
        ps.Conjunction([NS_cabin, NS_embarked]),
        ps.Conjunction([NS_cabin, NS_male]),
        ps.Conjunction([NS_embarked, NS_male]),
        ps.Conjunction([NS_cabin]),
        ps.Conjunction([NS_embarked]),
        ps.Conjunction([NS_male]),
        ps.Conjunction([NS_cabin, NS_female]),
        ps.Conjunction([NS_embarked, NS_female]),
        ps.Conjunction([NS_female]),
        ps.Conjunction([NS_cabin, NS_embarked2]),
    ]
    # Expected area qualities, aligned index-by-index with self.result.
    self.qualities = [178, 164, 146, 125, 110, 100, 86, 74, 56, 46]
    data = get_titanic_data()
    # Sanity check: each area quality is cover size times description depth.
    self.qualities2 = [
        np.count_nonzero(conj.covers(data)) * conj.depth for conj in self.result
    ]
    self.assertEqual(self.qualities, self.qualities2)
    searchSpace = ps.create_nominal_selectors(data)
    # NOTE(review): ps.FITarget is passed as the class, not an instance —
    # other setUps instantiate their targets; confirm this is intended.
    self.task = ps.SubgroupDiscoveryTask(data, ps.FITarget, searchSpace, result_set_size=10, depth=2, qf=ps.AreaQF())
def setUp(self):
    """Prepare expected results for a credit-data task with a minimum-support constraint."""
    NS_checking = ps.EqualitySelector("checking_status", b"<0")
    NS_foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
    NS_other_parties = ps.EqualitySelector("other_parties", b"none")
    NS_savings_status = ps.EqualitySelector("savings_status", b"<100")
    NS_payment_plans = ps.EqualitySelector("other_payment_plans", b"none")
    # Expected subgroup descriptions, ordered by descending quality.
    self.result = [
        ps.Conjunction([NS_checking, NS_foreign_worker]),
        ps.Conjunction([NS_checking]),
        ps.Conjunction([NS_checking, NS_other_parties, NS_foreign_worker]),
        ps.Conjunction([NS_checking, NS_other_parties]),
        ps.Conjunction([NS_checking, NS_savings_status, NS_foreign_worker]),
        ps.Conjunction([NS_checking, NS_savings_status]),
        ps.Conjunction([NS_checking, NS_foreign_worker, NS_payment_plans]),
        ps.Conjunction([NS_checking, NS_payment_plans]),
        ps.Conjunction([NS_foreign_worker, NS_savings_status]),
        ps.Conjunction(
            [NS_foreign_worker, NS_other_parties, NS_savings_status]),
    ]
    # Expected quality values, aligned index-by-index with self.result.
    self.qualities = [
        0.055299999999999995, 0.05280000000000001, 0.052300000000000006,
        0.05059999999999999, 0.04959999999999999, 0.048299999999999996,
        0.0426, 0.04, 0.03869999999999999, 0.03750000000000001
    ]
    data = get_credit_data()
    target = ps.BinaryTarget('class', b'bad')
    searchSpace = ps.create_nominal_selectors(data, ignore=['class'])
    # Same setup as the unconstrained variant, plus a minimum-support
    # constraint of 200 covered instances per subgroup.
    self.task = ps.SubgroupDiscoveryTask(
        data, target, searchSpace, result_set_size=10, depth=5,
        qf=ps.StandardQF(1.0), constraints=[ps.MinSupportConstraint(200)])
def calculate_quality_function_for_patterns(self, patterns, selectors_sorted, arrs):
    """Convert mined (indices, gp_params) patterns into (quality, subgroup) pairs.

    ``indices`` index into ``selectors_sorted``; ``arrs`` holds the per-selector
    cover arrays used when the quality function needs the joint cover.
    """
    out = []
    for indices, gp_params in self.tqdm(
        patterns,
        'computing quality function',
    ):
        if len(indices) > 0:
            selectors = [selectors_sorted[i] for i in indices]
            #print(selectors, stats)
            sg = ps.Conjunction(selectors)
            # NOTE(review): `task` is not defined anywhere in this method's
            # scope — as written every use of `task.qf` below raises
            # NameError. It should likely be a parameter or `self.task`;
            # confirm against the caller before fixing.
            if self.requires_cover_arr:
                # NOTE(review): np.all over a list of arrays without axis=0
                # reduces to a single scalar, not an element-wise cover
                # array — probably `np.all([...], axis=0)` was intended.
                statistics = task.qf.gp_get_params(np.all([arrs[i] for i in indices]), gp_params)
            else:
                statistics = task.qf.gp_get_params(None, gp_params)
            #qual1 = task.qf.evaluate(sg, task.qf.calculate_statistics(sg, task.data))
            qual2 = task.qf.evaluate(sg, statistics)
            out.append((qual2, sg))
    return out
def execute(self, task):
    """Best-first search driven directly by candidate quality.

    Pops the currently best-scoring description, scores all of its
    refinements, and queues refinable candidates keyed on their own quality.
    Returns a ps.SubgroupDiscoveryResult sorted by descending quality.
    """
    result = []
    # Min-heap of (-quality, description): heapq pops the smallest tuple,
    # so the highest-quality description comes out first.
    queue = [(float("-inf"), ps.Conjunction([]))]
    # NOTE(review): the 'target' column is dropped by hard-coded name —
    # presumably the data frame always carries it under that name; confirm.
    operator = SpecializationOperator(data=task.data.drop(['target'], axis=1),
                                      n_bins=self.n_bins,
                                      max_features=self.max_features,
                                      intervals_only=self.intervals_only,
                                      binning=self.binning,
                                      specialization=self.specialization,
                                      search_space=task.search_space)
    task.qf.calculate_constant_statistics(task.data, task.target)
    while queue:
        q, old_description = heappop(queue)
        q = -q
        # NOTE(review): pruning on plain quality rather than an optimistic
        # estimate makes this search heuristic, not exhaustive — confirm
        # that is the intended trade-off.
        if not q > ps.minimum_required_quality(result, task):
            break
        for candidate_description in operator.refinements(old_description):
            score_eval = task.qf.evaluate(candidate_description, task.target, task.data, None)
            ps.add_if_required(result, candidate_description, score_eval, task)
            # Only descriptions below the depth limit may be refined further.
            if len(candidate_description) < task.depth:
                heappush(queue, (-score_eval, candidate_description))
    result.sort(key=lambda x: x[0], reverse=True)
    return ps.SubgroupDiscoveryResult(result, task)
def setUp(self):
    """Prepare the expected top-12 result set and a StandardQF(0.5) task on the credit data."""
    NS_checking = ps.EqualitySelector("checking_status", b"<0")
    NS_foreign_worker = ps.EqualitySelector("foreign_worker", b"yes")
    NS_other_parties = ps.EqualitySelector("other_parties", b"none")
    NS_savings_status = ps.EqualitySelector("savings_status", b"<100")
    NS_job = ps.EqualitySelector("job", b"skilled")
    NS_dependents = ps.EqualitySelector("num_dependents", 1.0)
    # Expected subgroup descriptions, ordered by descending quality.
    self.result = [ps.Conjunction([NS_checking, NS_foreign_worker, NS_job, NS_other_parties, NS_savings_status]),
                   # AND job=='b'skilled'' AND other_parties=='b'none'' AND savings_status=='b'<100'
                   # 0.113713540226172: checking_status=='b'<0'' AND foreign_worker=='b'yes'' AND job=='b'skilled'' AND savings_status=='b'<100''
                   ps.Conjunction([NS_checking, NS_foreign_worker, NS_job, NS_savings_status]),
                   ps.Conjunction([NS_checking, NS_foreign_worker, NS_job]),
                   # checking_status=='b'<0'' AND foreign_worker=='b'yes'' AND job=='b'skilled''
                   # checking_status=='b'<0'' AND job=='b'skilled'' AND other_parties=='b'none'' AND savings_status=='b'<100''
                   ps.Conjunction([NS_checking, NS_job, NS_other_parties, NS_savings_status]),
                   ps.Conjunction([NS_checking, NS_foreign_worker, NS_job, NS_other_parties]),
                   ps.Conjunction([NS_checking, NS_job, NS_savings_status]),
                   ps.Conjunction([NS_checking, NS_foreign_worker, NS_other_parties, NS_savings_status]),
                   ps.Conjunction([NS_checking, NS_foreign_worker, NS_other_parties]),
                   ps.Conjunction([NS_checking, NS_foreign_worker, NS_savings_status]),
                   ps.Conjunction([NS_checking, NS_foreign_worker]),
                   ps.Conjunction([NS_checking, NS_foreign_worker, NS_job, NS_dependents, NS_savings_status]),
                   ps.Conjunction([NS_checking, NS_job, NS_other_parties])]
    # Expected quality values, aligned index-by-index with self.result.
    self.qualities = [0.11457431093955019,
                      0.113713540226172,
                      0.11201325679119281,
                      0.1117538749727658,
                      0.11161046793076415,
                      0.11145710640046322,
                      0.11045259291161472,
                      0.10929088624672183,
                      0.10875519439407161,
                      0.10866138825404954,
                      0.10832735026213287,
                      0.10813405094128754]
    data = get_credit_data()
    target = ps.BinaryTarget('class', b'bad')
    # Search over both nominal and numeric selectors (target column excluded).
    searchSpace_Nominal = ps.create_nominal_selectors(data, ignore=['class'])
    searchSpace_Numeric = ps.create_numeric_selectors(data, ignore=['class'])
    searchSpace = searchSpace_Nominal + searchSpace_Numeric
    self.task = ps.SubgroupDiscoveryTask(data, target, searchSpace, result_set_size=12, depth=5, qf=ps.StandardQF(0.5))