Example #1
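A recursive depth-first search over selector prefixes for a binary target. Subgroup extents are kept as bitsets, positives are counted by intersecting with the target bitset, and a branch is pruned when its optimistic estimate cannot beat the minimum required quality of the current result set.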
    def search_internal(self, task, prefix, modificationSet, result, bitset):
        sg = ps.Subgroup(task.target, copy.copy(prefix))

        sgSize = bitset.count()
        positiveInstances = bitset & self.targetBitset
        sgPositiveCount = positiveInstances.count()

        optimisticEstimate = task.qf.optimistic_estimate_from_statistics(
            self.popSize, self.popPositives, sgSize, sgPositiveCount)
        if optimisticEstimate <= ps.minimum_required_quality(result, task):
            return result

        quality = task.qf.evaluate_from_statistics(self.popSize,
                                                   self.popPositives, sgSize,
                                                   sgPositiveCount)
        ps.add_if_required(result, sg, quality, task)

        if len(prefix) < task.depth:
            newModificationSet = copy.copy(modificationSet)
            for sel in modificationSet:
                prefix.append(sel)
                newBitset = bitset & self.bitsets[sel]
                newModificationSet.pop(0)
                self.search_internal(task, prefix, newModificationSet, result,
                                     newBitset)
                # remove the sel again
                prefix.pop(-1)
        return result
Example #2
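The numeric-target variant of the same depth-first recursion: it forms cumulative sums of the target values inside the subgroup, evaluates the quality at every prefix size, and takes the maximum as the optimistic estimate.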
    def search_internal(self, task, prefix, modification_set, result, bitset):
        sg_size = bitset.sum()
        if sg_size == 0:
            return result
        target_values_sg = self.target_values[bitset]
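        # assumes self.target_values was sorted in descending order by the caller,
        # so the best quality over all prefix sizes is a valid optimistic estimate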

        target_values_cs = np.cumsum(target_values_sg)
        sizes = np.arange(1, len(target_values_cs) + 1)
        mean_values_cs = target_values_cs / sizes
        qualities = self.f(sizes, mean_values_cs)
        optimistic_estimate = np.max(qualities)

        if optimistic_estimate <= ps.minimum_required_quality(result, task):
            return result

        sg = ps.Subgroup(task.target, copy.copy(prefix))

        quality = qualities[-1]
        ps.add_if_required(result, sg, quality, task)

        if len(prefix) < task.depth:
            new_modification_set = copy.copy(modification_set)
            for sel in modification_set:
                prefix.append(sel)
                new_bitset = bitset & self.bitsets[sel]
                new_modification_set.pop(0)
                self.search_internal(task, prefix, new_modification_set,
                                     result, new_bitset)
                # remove the sel again
                prefix.pop(-1)
        return result
Example #3
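The same statistics-based depth-first search as Example #1, but using plain NumPy boolean arrays and np.logical_and instead of bitset objects.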
    def search_internal(self, task, prefix, modification_set, result, bitset):

        sg_size = bitset.sum()
        positive_instances = np.logical_and(bitset, self.target_bitset)
        sg_positive_count = positive_instances.sum()

        optimistic_estimate = task.qf.optimistic_estimate_from_statistics(
            self.pop_size, self.pop_positives, sg_size, sg_positive_count)
        if optimistic_estimate <= ps.minimum_required_quality(result, task):
            return result

        sg = ps.Subgroup(task.target, copy.copy(prefix))

        quality = task.qf.evaluate_from_statistics(self.pop_size,
                                                   self.pop_positives, sg_size,
                                                   sg_positive_count)
        ps.add_if_required(result, sg, quality, task)

        if len(prefix) < task.depth:
            new_modification_set = copy.copy(modification_set)
            for sel in modification_set:
                prefix.append(sel)
                new_bitset = np.logical_and(bitset, self.bitsets[sel])
                new_modification_set.pop(0)
                self.search_internal(task, prefix, new_modification_set,
                                     result, new_bitset)
                # remove the sel again
                prefix.pop(-1)
        return result
Example #4
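A dataset-based variant of the recursion: qualities and optimistic estimates are computed directly from task.data, optimistic-estimate pruning is optional, and weighted quality functions are supported via task.weighting_attribute.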
    def search_internal(self, task, prefix, modification_set, result,
                        use_optimistic_estimates):
        sg = ps.Subgroup(task.target,
                         ps.SubgroupDescription(copy.copy(prefix)))

        if (use_optimistic_estimates and len(prefix) < task.depth
                and isinstance(task.qf, ps.BoundedInterestingnessMeasure)):
            optimistic_estimate = task.qf.optimistic_estimate_from_dataset(
                task.data, sg)
            if optimistic_estimate <= ps.minimum_required_quality(result, task):
                return result

        if task.qf.supports_weights():
            quality = task.qf.evaluate_from_dataset(task.data, sg,
                                                    task.weighting_attribute)
        else:
            quality = task.qf.evaluate_from_dataset(task.data, sg)
        ps.add_if_required(result, sg, quality, task)

        if len(prefix) < task.depth:
            new_modification_set = copy.copy(modification_set)
            for sel in modification_set:
                prefix.append(sel)
                new_modification_set.pop(0)
                self.search_internal(task, prefix, new_modification_set,
                                     result, use_optimistic_estimates)
                # remove the sel again
                prefix.pop(-1)
        return result
Example #5
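A helper that returns the largest mean target value over all generalizations of a subgroup, i.e. over every subset of its selectors.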
def get_max_generalization_mean(data, subgroup, weighting_attribute=None):
    selectors = subgroup.subgroup_description.selectors
    generalizations = ps.powerset(selectors)
    max_mean = 0
    for sels in generalizations:
        sg = ps.Subgroup(subgroup.target, ps.Conjunction(list(sels)))
        mean_sg = sg.get_base_statistics(data, weighting_attribute)[3]
        max_mean = max(max_mean, mean_sg)
    return max_mean
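
A hedged usage sketch for this helper; the DataFrame, column names, and selector values below are hypothetical (the API pieces are reused from Examples #4 and #8):

import pandas as pd
import pysubgroup as ps

# Hypothetical data and selectors, for illustration only.
data = pd.read_csv("credit.csv")
subgroup = ps.Subgroup(
    ps.NumericTarget('credit_amount'),
    ps.SubgroupDescription([ps.NominalSelector("purpose", "other"),
                            ps.NominalSelector("housing", "own")]))
# prints the highest mean target value among all generalizations
print(get_max_generalization_mean(data, subgroup))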
Example #6
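Best-first search: candidate descriptions sit in a priority queue keyed by negated quality and optimistic estimate, and the loop terminates as soon as the best remaining candidate cannot reach the minimum required quality.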
    def execute(self, task):
        result = []
        queue = []
        measure_statistics_based = hasattr(
            task.qf, 'optimistic_estimate_from_statistics')

        # init the first level
        for sel in task.search_space:
            queue.append((float("-inf"), [sel]))

        while queue:
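            # qualities were pushed negated, so heapq (a min-heap)
            # pops the most promising candidate first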
            q, candidate_description = heappop(queue)
            q = -q
            if q < ps.minimum_required_quality(result, task):
                break

            sg = ps.Subgroup(task.target, candidate_description)

            if measure_statistics_based:
                statistics = sg.get_base_statistics(task.data)
                ps.add_if_required(
                    result, sg, task.qf.evaluate_from_statistics(*statistics),
                    task)
                if isinstance(task.qf, ps.BoundedInterestingnessMeasure):
                    optimistic_estimate = \
                        task.qf.optimistic_estimate_from_statistics(*statistics)
                else:
                    optimistic_estimate = float("inf")
            else:
                ps.add_if_required(
                    result, sg, task.qf.evaluate_from_dataset(task.data, sg),
                    task)
                if isinstance(task.qf, ps.BoundedInterestingnessMeasure):
                    optimistic_estimate = \
                        task.qf.optimistic_estimate_from_dataset(task.data, sg)
                else:
                    optimistic_estimate = float("inf")

            # compute refinements and fill the queue
            if (len(candidate_description) < task.depth and optimistic_estimate
                    >= ps.minimum_required_quality(result, task)):
                # iterate over all selectors that come after the last selector
                # of the evaluated candidate in the initial search-space order
                index_of_last_selector = min(
                    task.search_space.index(candidate_description[-1]),
                    len(task.search_space) - 1)

                for sel in islice(task.search_space,
                                  index_of_last_selector + 1, None):
                    new_description = candidate_description + [sel]
                    heappush(queue, (-optimistic_estimate, new_description))
        result.sort(key=lambda x: x[0], reverse=True)
        return result
Example #7
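Beam search: each level refines every subgroup in the current beam by one additional selector and keeps only the best subgroups, until the beam no longer changes or the depth limit is reached.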
    def execute(self, task):
        # adapt beam width to the result set size if desired
        if self.beam_width_adaptive:
            self.beam_width = task.result_set_size

        # check whether the beam width is too small for the result set
        if self.beam_width < task.result_set_size:
            raise RuntimeError(
                'Beam width in the beam search algorithm is smaller than the result set size!'
            )

        # init
        beam = [(0, ps.Subgroup(task.target, []))]
        last_beam = None

        depth = 0
        while beam != last_beam and depth < task.depth:
            last_beam = beam.copy()
            for (_, last_sg) in last_beam:
                for sel in task.search_space:
                    # create a clone
                    new_selectors = list(
                        last_sg.subgroup_description.selectors)
                    if sel not in new_selectors:
                        new_selectors.append(sel)
                        sg = ps.Subgroup(task.target, new_selectors)
                        quality = task.qf.evaluate_from_dataset(task.data, sg)
                        ps.add_if_required(beam,
                                           sg,
                                           quality,
                                           task,
                                           check_for_duplicates=True)
            depth += 1

        result = beam[:task.result_set_size]
        result.sort(key=lambda x: x[0], reverse=True)
        return result
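
For context, a hedged sketch of how an execute method like this one is typically driven; ps.SubgroupDiscoveryTask, ps.create_selectors, ps.StandardQF, and ps.BeamSearch are assumed from pysubgroup's public API, and their exact signatures may differ between versions:

import pandas as pd
import pysubgroup as ps

# Assumed driver code; constructor signatures may vary across pysubgroup versions.
data = pd.read_csv("~/datasets/titanic.csv")      # dataset path as in Example #12
target = ps.NominalSelector('survived', 0)        # target style as in Example #12
search_space = ps.create_selectors(data, ignore=['survived'])
task = ps.SubgroupDiscoveryTask(data, target, search_space,
                                result_set_size=10, depth=2,
                                qf=ps.StandardQF(1.0))
for quality, sg in ps.BeamSearch(beam_width=10).execute(task):
    print(quality, sg)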
Example #8
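A script that loads the credit-g ARFF dataset, builds a subgroup for the numeric target credit_amount, and prints its base statistics and quality (note the camelCase method names, apparently from an older pysubgroup API).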
from scipy.io import arff
import pysubgroup as ps
import pandas as pd

import pprint
pp = pprint.PrettyPrinter(indent=4)

data = pd.DataFrame(arff.loadarff("../data/credit-g.arff")[0])

target = ps.NumericTarget('credit_amount')
sg = ps.Subgroup(target, ps.NominalSelector("purpose", b"other"))
print(target.get_base_statistics(data, sg))
sg.calculateStatistics(data)
# pp.pprint (sg.statistics)

qf = ps.StandardQF_numeric(1.0)
print(qf.evaluateFromDataset(data, sg))
Example #9
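A toy end-to-end example on a hand-built five-row DataFrame that prints the optimistic estimate of a numeric quality function for a single-selector subgroup (again using the older camelCase API).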
import pysubgroup as ps
import pandas as pd
import numpy as np

import pprint

pp = pprint.PrettyPrinter(indent=4)

data = np.array([[1, 2, 3, 4, 5], ["F", "F", "F", "Tr", "Tr"]]).T
data = pd.DataFrame(data, columns=["Target", "A"])
data["Target"] = pd.to_numeric(data["Target"])

target = ps.NumericTarget('Target')
print(data[target.target_variable])
sg = ps.Subgroup(target, ps.NominalSelector("A", "Tr"))
print(target.get_base_statistics(data, sg))
sg.calculateStatistics(data)
# pp.pprint (sg.statistics)
qf = ps.StandardQF_numeric(1.0)
print(qf.optimisticEstimateFromDataset(data, sg))
Example #10
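An Apriori-style level-wise search: subgroups of one level are evaluated and pruned by optimistic estimate, and promising candidates are joined pairwise to form the next level, keeping a candidate only if all of its generalizations are promising as well.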
    def execute(self, task):
        measure_statistics_based = hasattr(
            task.qf, 'optimistic_estimate_from_statistics')
        result = []

        # init the first level
        next_level_candidates = []
        for sel in task.search_space:
            next_level_candidates.append(ps.Subgroup(task.target, [sel]))

        # level-wise search
        depth = 1
        while next_level_candidates:
            # check sgs from the last level
            promising_candidates = []
            for sg in next_level_candidates:
                if measure_statistics_based:
                    statistics = sg.get_base_statistics(task.data)
                    ps.add_if_required(
                        result, sg,
                        task.qf.evaluate_from_statistics(*statistics), task)
                    if isinstance(task.qf, ps.BoundedInterestingnessMeasure):
                        optimistic_estimate = \
                            task.qf.optimistic_estimate_from_statistics(*statistics)
                    else:
                        optimistic_estimate = float("inf")
                else:
                    ps.add_if_required(
                        result, sg,
                        task.qf.evaluate_from_dataset(task.data, sg), task)
                    if isinstance(task.qf, ps.BoundedInterestingnessMeasure):
                        optimistic_estimate = \
                            task.qf.optimistic_estimate_from_dataset(task.data, sg)
                    else:
                        optimistic_estimate = float("inf")

                if optimistic_estimate >= ps.minimum_required_quality(
                        result, task):
                    promising_candidates.append(
                        sg.subgroup_description.selectors)

            if depth == task.depth:
                break

            # generate candidates next level
            next_level_candidates = []
            for i, sg1 in enumerate(promising_candidates):
                for j, sg2 in enumerate(promising_candidates):
                    if i < j and sg1[:-1] == sg2[:-1]:
                        candidate = list(sg1) + [sg2[-1]]
                        # check if ALL generalizations are contained in promising_candidates
                        generalization_descriptions = [[
                            x for x in candidate if x != sel
                        ] for sel in candidate]
                        if all(g in promising_candidates
                               for g in generalization_descriptions):
                            next_level_candidates.append(
                                ps.Subgroup(task.target, candidate))
            depth = depth + 1

        result.sort(key=lambda x: x[0], reverse=True)
        return result
Example #11
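A small helper that turns stored (quality, description) result pairs back into (quality, Subgroup) tuples.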
    def to_subgroups(self):
        return [(quality, ps.Subgroup(self.task.target, description))
                for quality, description in self.results]
Example #12
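A minimal script showing that two Subgroup objects built from the same target and an empty description compare equal.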
'''
Created on 10.05.2017

@author: lemmerfn
'''
import pandas as pd
import pysubgroup as ps

if __name__ == '__main__':
    data = pd.read_csv("~/datasets/titanic.csv")
    target = ps.NominalSelector('survived', 0)

    s1 = ps.Subgroup(target, [])
    s2 = ps.Subgroup(target, [])

    print(s1 == s2)