def test_refine_empty_subgroup(self):
    """A seed whose condition covers no rows should gain no refinements.

    The test expects `refine` to hand back exactly one description.
    """
    frame = pd.DataFrame({'A': [1]})
    # The only value of A is 1, so this condition matches 0 rows.
    empty_seed = description_factory(Condition('`A` == 2'), frame)

    refined = refine(frame, [], empty_seed)

    self.assertEqual(1, len(refined), 'No refinements added')
def test_refine_duplicate_splits(self):
    """Coinciding split points must not yield duplicate descriptions."""
    # With 3 equal-width bins over these values, the splits at
    # position 1 and 2 are duplicates of each other.
    frame = pd.DataFrame({'A': [0, 1, 1, 2]})
    seed = description_factory([], frame)

    refined = refine(frame, [], seed)

    # `A >= 1` and `A <= 1` should each appear only once.
    self.assertEqual(3, len(refined), 'Only contains unique inequalities')
def test_refine_boolean(self):
    """A boolean column is refined into `== 1` and `== 0` conditions.

    Three descriptions are expected in total, i.e. two added conditions.
    """
    dataset = pd.DataFrame({'A': [True, False]})
    descriptions = refine(dataset, [], description_factory([], dataset))
    queries = [d.to_querystring() for d in descriptions]

    # (expected, actual) order, matching the sibling refine tests.
    self.assertEqual(3, len(descriptions), 'Added 2 conditions')
    # assertIn reports the contents of `queries` on failure, unlike
    # comparing the result of `in` against True.
    self.assertIn('`A` == 1', queries, 'Added condition equal to True')
    self.assertIn('`A` == 0', queries, 'Added condition equal to False')
def test_refine_numeric(self):
    """A numeric column is refined into `<=` / `>=` conditions per split."""
    frame = pd.DataFrame({'A': [1, 2, 3, 4]})
    seed = description_factory([], frame)

    refined = refine(frame, [], seed)
    queries = [description.to_querystring() for description in refined]

    self.assertEqual(5, len(refined),
                     'Added 4 conditions (2 * (num_buckets - 1))')
    # Both inequality directions are expected at each of the splits 2 and 3.
    for expected_query in ('`A` <= 2', '`A` >= 2', '`A` <= 3', '`A` >= 3'):
        self.assertIn(expected_query, queries)
def test_refine_nominal(self):
    """A nominal column gets an `==` and a `!=` condition per distinct value.

    With g = 3 distinct values the test expects 2g = 6 added conditions,
    7 descriptions in total.
    """
    dataset = pd.DataFrame({'A': ['foo', 'bar', 'lex']})
    descriptions = refine(dataset, [], description_factory([], dataset))
    queries = [d.to_querystring() for d in descriptions]

    # The message previously claimed 4 added conditions, which contradicted
    # the asserted count (7 descriptions = 6 added conditions).
    self.assertEqual(7, len(descriptions), 'Added 6 conditions (2g)')
    self.assertIn("`A` == 'foo'", queries, 'Added condition equal to g(1)')
    self.assertIn("`A` != 'foo'", queries, 'Added condition not equal to g(1)')
    self.assertIn("`A` == 'bar'", queries, 'Added condition equal to g(2)')
    self.assertIn("`A` != 'bar'", queries, 'Added condition not equal to g(2)')
    self.assertIn("`A` == 'lex'", queries, 'Added condition equal to g(3)')
    self.assertIn("`A` != 'lex'", queries, 'Added condition not equal to g(3)')
def test_refine_unsupported_type(self):
    """Refining a column of an unsupported type raises NotImplementedError."""
    # No refinement is implemented for dates.
    frame = pd.DataFrame({'A': [datetime.now()]})

    with self.assertRaises(NotImplementedError):
        # The seed is built inside the context as well, so an error raised
        # while constructing it is also accepted.
        seed = description_factory([], frame)
        refine(frame, [], seed)
def beam_search(data, targets, quality_measure: QualityMeasure, options=None):
    """Run a level-wise beam search for high-quality subgroup descriptions.

    Starting from the initial seed for `data`, every candidate is refined,
    each refinement is scored with `quality_measure`, and refinements that
    satisfy all constraints and are not yet in the result set are stored.
    The contents of each level's beam become the candidates for the next
    level. Progress and the active settings are printed to stdout.

    Args:
        data: Dataset handed to `get_initial_seed`, `quality_measure.set_data`
            and `refine`.
        targets: Passed through to `refine` unchanged.
        quality_measure: Object providing `set_data(data)` and
            `calculate(description)`, which returns `(coverage, quality)`.
        options: Settings forwarded to `set_options`. `None` (the default)
            is treated as an empty dict.

    Returns:
        A list with the entries of the result set in reversed iteration
        order (intended to be descending quality).
    """
    timer = Timer()
    timer.start()  # Keep track of execution time for debugging

    # Use a fresh dict per call rather than one shared mutable default.
    set_options({} if options is None else options)

    candidate_queue = Queue(maxsize=0)
    candidate_queue.put(get_initial_seed(data))
    result_set = MinPriorityQueue(max_size=config.RESULT_SET_SIZE)

    # Print settings
    print('Settings:')
    for var in [x for x in dir(config) if not x.startswith('__')]:
        print(f' {var}={getattr(config, var)}')

    print('Setting-up quality measure...')
    quality_measure.set_data(data)

    print('Finding subgroups...')
    for depth in range(config.SEARCH_DEPTH):
        beam = MinPriorityQueue(max_size=config.BEAM_WIDTH)

        while not candidate_queue.empty():
            seed = candidate_queue.get().description

            for description in refine(data, targets, seed):
                coverage, quality = quality_measure.calculate(description)

                # Check if the description satisfies the constraints
                if not satisfies_all(description=description,
                                     coverage=coverage,
                                     quality=quality):
                    continue  # Continue with next candidate description

                result = Result(quality=quality, description=description)

                # Check if the description is novel
                if not result_set.contains(result):
                    result_set.put(result)
                    beam.put(result)

        print(f'Best subgroups at depth {depth}:')
        # Drain the beam: its contents seed the next search level.
        while not beam.empty():
            # Not sure about get(), pseudo code uses get_front_element()
            candidate = beam.get()
            print(
                f'quality = {round(candidate.quality, 5)}, description = {candidate.description.to_querystring()}'
            )
            candidate_queue.put(candidate)

    # NOTE(review): assumes iterating result_set yields ascending quality,
    # so reversing sorts by descending quality - confirm against
    # MinPriorityQueue.
    top_q = list(result_set)
    top_q.reverse()

    print('Done.')
    print(f'Finished in {round(timer.elapsed_time())} seconds.')

    return top_q