def test_priority_queue_contains(self):
        # `contains` is expected to match on the contents of a result rather
        # than on object identity.
        frame = pd.DataFrame({
            'A': [1, 2],
            'B': [1, 2],
        })  # Descriptions are evaluated against this data
        pq = MinPriorityQueue(max_size=1)

        # stored / twin: separate objects built from the same condition.
        # stranger: separate object built from a different condition.
        stored, twin, stranger = [
            Result(quality=1,
                   description=description_factory(Condition(query), frame))
            for query in ('A == 1', 'A == 1', 'B == 2')
        ]

        pq.put(stored)

        self.assertTrue(pq.contains(twin),
                        'Compares hashed value instead of references')
        self.assertFalse(pq.contains(stranger),
                         'Hashes quality as well as description')
    def test_priority_queue_equal_quality(self):
        # When two results share the same quality, the test expects the one
        # with the weaker description to be listed first.
        frame = pd.DataFrame({
            'A': [1, 2],
            'B': [1, 2],
        })  # Descriptions are evaluated against this data
        pq = MinPriorityQueue(max_size=2)

        stronger = Result(
            quality=1,
            description=description_factory(Condition('A == 1'), frame))
        # Same quality as `stronger`, but a weaker description (larger coverage)
        weaker = Result(
            quality=1,
            description=description_factory(Condition('B >= 1'), frame))

        for entry in (stronger, weaker):
            pq.put(entry)

        ordered = list(pq)

        self.assertEqual(weaker, ordered[0],
                         "The description 'B >= 1' is weaker")
        self.assertEqual(stronger, ordered[1],
                         "The description 'A == 1' is stronger")
Exemple #3
0
    def test_refine_empty_subgroup(self):
        # Refining a seed that selects no rows should return a single
        # description, i.e. nothing is added.
        frame = pd.DataFrame({'A': [1]})

        # `A` is never 2 in the frame, so this condition matches 0 rows
        empty_seed = description_factory(Condition('`A` == 2'), frame)

        refined = refine(frame, [], empty_seed)

        self.assertEqual(1, len(refined), 'No refinements added')
Exemple #4
0
    def test_refine_duplicate_splits(self):
        # Duplicate split points must not produce duplicate refinements.
        frame = pd.DataFrame({'A': [0, 1, 1, 2]})
        seed = description_factory([], frame)

        # With 3 equal-width bins the splits at position 1 and 2 coincide,
        # so the same inequalities would be generated twice.
        refined = refine(frame, [], seed)

        # `A >= 1` and `A <= 1` should each appear only once
        self.assertEqual(3, len(refined),
                         'Only contains unique inequalities')
    def test_priority_queue_overflowing(self):
        # Putting three results into a queue with max_size=2 is expected to
        # drop the one with the lowest quality.
        pq = MinPriorityQueue(max_size=2)

        descriptions = [
            description_factory(Condition(label)) for label in ('L', 'M', 'H')
        ]
        low, medium, high = [
            Result(quality=rank, description=description)
            for rank, description in enumerate(descriptions, start=1)
        ]

        for entry in (low, medium, high):
            pq.put(entry)

        self.assertFalse(pq.contains(low), 'Removed low quality item')
        self.assertTrue(pq.contains(medium), 'Contains medium priority item')
        self.assertTrue(pq.contains(high), 'Contains high priority item')
Exemple #6
0
    def test_refine_boolean(self):
        # Refining a boolean column should add one condition per truth value
        # on top of the seed description.
        dataset = pd.DataFrame({'A': [True, False]})

        descriptions = refine(dataset, [], description_factory([], dataset))
        queries = [d.to_querystring() for d in descriptions]

        # Expected value goes first, as in the other assertEqual calls here.
        self.assertEqual(3, len(descriptions), 'Added 2 conditions')
        # assertIn shows the actual query list on failure, whereas comparing
        # a membership boolean to True only reports "False != True".
        self.assertIn('`A` == 1', queries, 'Added condition equal to True')
        self.assertIn('`A` == 0', queries, 'Added condition equal to False')
    def test_priority_queue_list(self):
        # Iterating the queue should yield results ordered by quality, not in
        # the internal heap layout.
        pq = MinPriorityQueue(max_size=3)

        descriptions = [
            description_factory(Condition(label)) for label in ('C', 'B', 'A')
        ]

        # Insert items on purpose in non-ascending order to force rebuilding the heap
        for quality, description in zip((3, 2, 1), descriptions):
            pq.put(Result(quality=quality, description=description))

        ordered = list(pq)

        self.assertEqual(len(ordered), 3)

        # Make sure the list is correctly ordered (low to high), i.e. not a dump of the heap
        for position, expected_query in enumerate(('A', 'B', 'C')):
            self.assertEqual(
                expected_query,
                ordered[position].description.to_querystring())
Exemple #8
0
    def test_refine_numeric(self):
        # Refining a numeric column should add a `<=` and a `>=` condition for
        # each split point, on top of the seed description.
        frame = pd.DataFrame({'A': [1, 2, 3, 4]})

        refined = refine(frame, [], description_factory([], frame))
        query_strings = [item.to_querystring() for item in refined]

        self.assertEqual(5, len(refined),
                         'Added 4 conditions (2 * (num_buckets - 1))')
        for expected in ('`A` <= 2', '`A` >= 2', '`A` <= 3', '`A` >= 3'):
            self.assertIn(expected, query_strings)
Exemple #9
0
    def test_refine_nominal(self):
        # Refining a nominal column with g distinct values should add an
        # equality and an inequality condition per value (2g in total) on top
        # of the seed description. Here g = 3, so 1 + 6 = 7 descriptions.
        dataset = pd.DataFrame({'A': ['foo', 'bar', 'lex']})

        descriptions = refine(dataset, [], description_factory([], dataset))
        queries = [d.to_querystring() for d in descriptions]

        # The failure message previously said "4 conditions", which did not
        # match the expected count of 7 (seed + 6 refinements).
        self.assertEqual(7, len(descriptions), 'Added 6 conditions (2g)')
        self.assertIn("`A` == 'foo'", queries, 'Added condition equal to g(1)')
        self.assertIn("`A` != 'foo'", queries,
                      'Added condition not equal to g(1)')
        self.assertIn("`A` == 'bar'", queries,
                      'Added condition equal to g(2)')
        self.assertIn("`A` != 'bar'", queries,
                      'Added condition not equal to g(2)')
        self.assertIn("`A` == 'lex'", queries,
                      'Added condition equal to g(3)')
        self.assertIn("`A` != 'lex'", queries,
                      'Added condition not equal to g(3)')
Exemple #10
0
    def test_calculate(self):
        # The quality measure should report the subgroup's coverage and a
        # non-zero quality when the subgroup ranks labels differently.
        frame = pd.DataFrame({'foo': ['a', 'a', 'b', 'c']})
        subgroup = description_factory(Condition("`foo` == 'b'"), frame)

        # Label ranking of population: [a: 1, b: 2, c: 3]
        # Label ranking of subgroup: [a: 2, b: 1, c: 3]

        leaves = [tree.Node('b'), tree.Node('c')]
        label_tree = tree.Node('a', children=leaves)

        # Gap component is 1 so all subgroups are exceptional
        measure = LabelDistribution(
            target='foo', tree=label_tree, gap_func=lambda x, y: 1)
        measure.set_data(frame)

        coverage, quality = measure.calculate(subgroup)

        self.assertEqual(1, coverage)
        # Subgroup has a different label ranking so the quality should be non-zero
        self.assertNotEqual(0, quality)
Exemple #11
0
    def test_refine_unsupported_type(self):
        # Refining a column type without an implemented refinement should
        # raise NotImplementedError.
        frame = pd.DataFrame({'A': [datetime.now()]})  # No refinement implemented for dates

        with self.assertRaises(NotImplementedError):
            refine(frame, [], description_factory([], frame))
def get_initial_seed(data: pd.DataFrame) -> Result:
    """Return the starting result for `data`.

    The result wraps a description built from an empty list of conditions
    and carries a quality of -1.
    """
    empty_description = description_factory(conditions=[], data=data)
    return Result(quality=-1, description=empty_description)