Ejemplo n.º 1
0
    def search_internal(self, task, prefix, modification_set, result, bitset):
        """Recursive DFS step: evaluate the subgroup described by *prefix*
        and expand it with every selector still in *modification_set*."""
        subgroup_size = bitset.sum()
        positives = np.logical_and(bitset, self.target_bitset)
        positive_count = positives.sum()

        # Prune this branch if even the best possible refinement cannot beat
        # the current result set.
        estimate = task.qf.optimistic_estimate_from_statistics(
            self.pop_size, self.pop_positives, subgroup_size, positive_count)
        if estimate <= ps.minimum_required_quality(result, task):
            return result

        sg = ps.Subgroup(task.target, copy.copy(prefix))
        quality = task.qf.evaluate_from_statistics(
            self.pop_size, self.pop_positives, subgroup_size, positive_count)
        ps.add_if_required(result, sg, quality, task)

        if len(prefix) < task.depth:
            remaining = copy.copy(modification_set)
            for selector in modification_set:
                # Drop the current selector from the set handed to deeper
                # levels so each combination is generated only once.
                remaining.pop(0)
                prefix.append(selector)
                self.search_internal(
                    task, prefix, remaining, result,
                    np.logical_and(bitset, self.bitsets[selector]))
                prefix.pop(-1)  # undo the extension before the next selector
        return result
Ejemplo n.º 2
0
    def search_internal(self, task, prefix, modificationSet, result, bitset):
        """Recursive DFS step over bitset representations.

        Evaluates the subgroup described by *prefix*, prunes the branch with
        an optimistic estimate, then recurses over every selector remaining
        in *modificationSet*.
        """
        sgSize = bitset.count()
        positiveInstances = bitset & self.targetBitset
        sgPositiveCount = positiveInstances.count()

        optimisticEstimate = task.qf.optimistic_estimate_from_statistics(
            self.popSize, self.popPositives, sgSize, sgPositiveCount)
        if optimisticEstimate <= ps.minimum_required_quality(result, task):
            return result

        # Construct the subgroup only after the pruning check succeeded:
        # avoids copying the prefix for branches that are discarded anyway
        # (consistent with the numpy-based sibling implementation).
        sg = ps.Subgroup(task.target, copy.copy(prefix))

        quality = task.qf.evaluate_from_statistics(self.popSize,
                                                   self.popPositives, sgSize,
                                                   sgPositiveCount)
        ps.add_if_required(result, sg, quality, task)

        if len(prefix) < task.depth:
            newModificationSet = copy.copy(modificationSet)
            for sel in modificationSet:
                prefix.append(sel)
                newBitset = bitset & self.bitsets[sel]
                # Remove sel from the set used at deeper levels so each
                # selector combination is generated only once.
                newModificationSet.pop(0)
                self.search_internal(task, prefix, newModificationSet, result,
                                     newBitset)
                # remove the sel again
                prefix.pop(-1)
        return result
Ejemplo n.º 3
0
    def get_next_level_candidates_vectorized(self, task, result,
                                             next_level_candidates):
        """Evaluate all candidates at once via vectorized statistics and
        return the selector lists whose optimistic estimate may still
        reach the result set."""
        estimate_fn = getattr(task.qf, self.optimistic_estimate_name)
        statistics = [
            task.qf.calculate_statistics(sg, task.data)
            for sg in next_level_candidates
        ]
        # Transpose the per-subgroup tuples into one namedtuple of arrays.
        tpl_class = statistics[0].__class__
        vec_statistics = tpl_class._make(
            np.array(column) for column in zip(*statistics))
        qualities = task.qf.evaluate(None, vec_statistics)
        optimistic_estimates = estimate_fn(None, vec_statistics)

        for sg, quality, stats in zip(next_level_candidates, qualities,
                                      statistics):
            ps.add_if_required(result, sg, quality, task, statistics=stats)

        min_quality = ps.minimum_required_quality(result, task)
        return [
            sg.selectors
            for sg, estimate in zip(next_level_candidates,
                                    optimistic_estimates)
            if estimate >= min_quality
        ]
Ejemplo n.º 4
0
    def execute(self, task):
        """Exhaustively evaluate every selector combination up to
        ``task.depth`` and return the sorted discovery result."""
        task.qf.calculate_constant_statistics(task)
        result = []
        candidate_tuples = chain.from_iterable(
            combinations(task.search_space, depth)
            for depth in range(1, task.depth + 1))
        if self.show_progress:
            try:
                from tqdm import tqdm

                def binomial(n, k):
                    # n choose k; 0 for invalid arguments (k > n).
                    try:
                        return factorial(n) // factorial(k) // factorial(n - k)
                    except ValueError:
                        return 0

                total = sum(
                    binomial(len(task.search_space), depth)
                    for depth in range(1, task.depth + 1))
                candidate_tuples = tqdm(candidate_tuples, total=total)
            except ImportError:
                # Progress display is optional; continue without it.
                pass
        for selectors in candidate_tuples:
            sg = ps.Conjunction(selectors)
            statistics = task.qf.calculate_statistics(sg, task.data)
            ps.add_if_required(result, sg, task.qf.evaluate(sg, statistics),
                               task)
        result.sort(key=lambda entry: entry[0], reverse=True)
        return ps.SubgroupDiscoveryResult(result, task)
Ejemplo n.º 5
0
    def search_internal(self, task, prefix, modification_set, result, bitset):
        """DFS step for a numeric target using cumulative-mean statistics.

        Evaluates the subgroup covered by *bitset*, derives an optimistic
        estimate from the cumulative means of the covered target values,
        and recurses over the selectors remaining in *modification_set*.
        """
        # Track recursion count for diagnostics.
        self.num_calls += 1
        sg_size = bitset.sum()
        if sg_size == 0:
            return result
        # Target values of the instances covered by this subgroup
        # (assumes self.target_values is ordered so that cumulative prefixes
        # bound the refinements -- TODO confirm against the class setup).
        target_values_sg = self.target_values[bitset]

        target_values_cs = np.cumsum(target_values_sg)
        sizes = np.arange(1, len(target_values_cs) + 1)
        mean_values_cs = target_values_cs / sizes
        tpl = DFSNumeric.tpl(sizes, mean_values_cs)
        # Qualities for every prefix of the covered instances; their maximum
        # serves as the optimistic estimate for all refinements.
        qualities = self.evaluate(None, tpl)
        optimistic_estimate = np.max(qualities)

        if optimistic_estimate <= ps.minimum_required_quality(result, task):
            return result

        sg = ps.Conjunction(copy.copy(prefix))

        # The quality of the full cover is the last cumulative entry.
        quality = qualities[-1]
        ps.add_if_required(result, sg, quality, task)

        if len(prefix) < task.depth:
            new_modification_set = copy.copy(modification_set)
            for sel in modification_set:
                prefix.append(sel)
                new_bitset = bitset & self.bitsets[sel]
                # Drop sel for deeper levels so combinations are unique.
                new_modification_set.pop(0)
                self.search_internal(task, prefix, new_modification_set,
                                     result, new_bitset)
                # remove the sel again
                prefix.pop(-1)
        return result
Ejemplo n.º 6
0
    def search_internal(self, task, prefix, modification_set, result,
                        use_optimistic_estimates):
        """Recursively evaluate the conjunction in *prefix*, optionally
        pruning via optimistic estimates, then expand with each selector
        remaining in *modification_set*."""
        sg = ps.Conjunction(copy.copy(prefix))

        statistics = task.qf.calculate_statistics(sg, task.data)
        may_prune = (use_optimistic_estimates
                     and len(prefix) < task.depth
                     and isinstance(task.qf,
                                    ps.BoundedInterestingnessMeasure))
        if may_prune:
            estimate = task.qf.optimistic_estimate(sg, statistics)
            if not estimate > ps.minimum_required_quality(result, task):
                return result
        ps.add_if_required(result, sg, task.qf.evaluate(sg, statistics), task)

        if len(prefix) < task.depth:
            remaining = copy.copy(modification_set)
            for selector in modification_set:
                prefix.append(selector)
                # Drop the selector for deeper levels so each combination
                # is generated only once.
                remaining.pop(0)
                self.search_internal(task, prefix, remaining, result,
                                     use_optimistic_estimates)
                prefix.pop(-1)  # restore prefix for the next selector
        return result
Ejemplo n.º 7
0
    def search_internal(self, task, prefix, modification_set, result, bitset):
        """Numeric-target DFS step.

        Cumulative means over the covered target values yield, in one
        vectorized call, both the quality of the current subgroup and an
        optimistic estimate for all of its refinements.
        """
        sg_size = bitset.sum()
        if sg_size == 0:
            # Empty cover: nothing to evaluate or refine. Return *result* so
            # every exit path of this method is consistent (the bare `return`
            # previously yielded None here).
            return result
        target_values_sg = self.target_values[bitset]

        target_values_cs = np.cumsum(target_values_sg)
        sizes = np.arange(1, len(target_values_cs) + 1)
        mean_values_cs = target_values_cs / sizes
        # Qualities for every prefix of the covered instances; the maximum
        # bounds the quality of any refinement of this subgroup.
        qualities = self.f(sizes, mean_values_cs)
        optimistic_estimate = np.max(qualities)

        if optimistic_estimate <= ps.minimum_required_quality(result, task):
            return result

        sg = ps.Subgroup(task.target, copy.copy(prefix))

        # The quality of the full cover is the last cumulative entry.
        quality = qualities[-1]
        ps.add_if_required(result, sg, quality, task)

        if len(prefix) < task.depth:
            new_modification_set = copy.copy(modification_set)
            for sel in modification_set:
                prefix.append(sel)
                new_bitset = bitset & self.bitsets[sel]
                # Drop sel for deeper levels so combinations are unique.
                new_modification_set.pop(0)
                self.search_internal(task, prefix, new_modification_set,
                                     result, new_bitset)
                # remove the sel again
                prefix.pop(-1)
        return result
Ejemplo n.º 8
0
    def execute(self, task):
        """Best-first search driven by a max-heap of optimistic estimates."""
        result = []
        # heapq is a min-heap, so priorities are stored negated.
        queue = [(float("-inf"), ps.Conjunction([]))]
        operator = ps.StaticSpecializationOperator(task.search_space)
        task.qf.calculate_constant_statistics(task)
        while queue:
            negated_quality, old_description = heappop(queue)
            if not -negated_quality > ps.minimum_required_quality(
                    result, task):
                # Best remaining estimate cannot improve the result set.
                break
            for sg in operator.refinements(old_description):
                statistics = task.qf.calculate_statistics(sg, task.data)
                ps.add_if_required(result, sg,
                                   task.qf.evaluate(sg, statistics), task)
                estimate = task.qf.optimistic_estimate(sg, statistics)

                # Enqueue for further refinement only while depth allows and
                # the estimate still clears the quality threshold.
                if (len(sg) < task.depth
                        and estimate >= ps.minimum_required_quality(
                            result, task)):
                    heappush(queue, (-estimate, sg))

        result.sort(key=lambda entry: entry[0], reverse=True)
        return ps.SubgroupDiscoveryResult(result, task)
Ejemplo n.º 9
0
    def get_next_level_candidates(self, task, result, next_level_candidates):
        """Evaluate each candidate subgroup and return the selector lists of
        those whose optimistic estimate may still beat the result set."""
        promising_candidates = []
        # Resolve the optimistic-estimate method by name (configured on self).
        optimistic_estimate_function = getattr(task.qf,
                                               self.optimistic_estimate_name)
        for sg in next_level_candidates:
            statistics = task.qf.calculate_statistics(sg, task.data)
            ps.add_if_required(result,
                               sg,
                               task.qf.evaluate(sg, statistics),
                               task,
                               statistics=statistics)
            optimistic_estimate = optimistic_estimate_function(sg, statistics)

            # First filter against the threshold as it stands right now ...
            if optimistic_estimate >= ps.minimum_required_quality(
                    result, task):
                if ps.constraints_hold(task.constraints_monotone, sg,
                                       statistics, task.data):
                    promising_candidates.append(
                        (optimistic_estimate, sg.selectors))
        # ... then re-filter with the final (possibly stricter) threshold,
        # since adding subgroups above may have raised the minimum quality.
        min_quality = ps.minimum_required_quality(result, task)
        promising_candidates = [
            selectors for estimate, selectors in promising_candidates
            if estimate > min_quality
        ]
        return promising_candidates
Ejemplo n.º 10
0
    def search_internal(self, task, prefix, modification_set, result,
                        use_optimistic_estimates):
        """Recursive DFS step that evaluates subgroups directly on the
        dataset, with optional optimistic-estimate pruning."""
        sg = ps.Subgroup(task.target,
                         ps.SubgroupDescription(copy.copy(prefix)))

        may_prune = (use_optimistic_estimates
                     and len(prefix) < task.depth
                     and isinstance(task.qf,
                                    ps.BoundedInterestingnessMeasure))
        if may_prune:
            estimate = task.qf.optimistic_estimate_from_dataset(task.data, sg)
            if estimate <= ps.minimum_required_quality(result, task):
                return result

        # Weighted evaluation only when the quality function supports it.
        if task.qf.supports_weights():
            quality = task.qf.evaluate_from_dataset(task.data, sg,
                                                    task.weighting_attribute)
        else:
            quality = task.qf.evaluate_from_dataset(task.data, sg)
        ps.add_if_required(result, sg, quality, task)

        if len(prefix) < task.depth:
            remaining = copy.copy(modification_set)
            for selector in modification_set:
                prefix.append(selector)
                # Drop the selector for deeper levels so each combination
                # is generated only once.
                remaining.pop(0)
                self.search_internal(task, prefix, remaining, result,
                                     use_optimistic_estimates)
                prefix.pop(-1)  # undo the extension
        return result
Ejemplo n.º 11
0
    def execute(self, task):
        """Generalization-aware best-first search over disjunctions.

        Starts from single selectors and grows disjunctions; the optimistic
        estimate is discounted by ``alpha`` per added selector before the
        pruning comparison. Prints the result set and the per-level discard
        counters when done.
        """
        result = []
        queue = []
        operator = ps.StaticGeneralizationOperator(task.search_space)
        # init the first level
        for sel in task.search_space:
            queue.append((float("-inf"), ps.Disjunction([sel])))
        task.qf.calculate_constant_statistics(task)

        while queue:
            # heapq is a min-heap, so qualities are stored negated.
            q, candidate_description = heappop(queue)
            q = -q
            if q < ps.minimum_required_quality(result, task):
                break

            sg = candidate_description
            statistics = task.qf.calculate_statistics(sg, task.data)
            quality = task.qf.evaluate(sg, statistics)
            ps.add_if_required(result,
                               sg,
                               quality,
                               task,
                               statistics=statistics)

            qual = ps.minimum_required_quality(result, task)

            # If the candidate entered the result set the threshold may have
            # risen: rebuild the queue without entries that cannot qualify.
            if (quality, sg) in result:
                new_queue = []
                for q_tmp, c_tmp in queue:
                    if (-q_tmp) > qual:
                        heappush(new_queue, (q_tmp, c_tmp))
                queue = new_queue
            optimistic_estimate = task.qf.optimistic_estimate(sg, statistics)
            # else:
            #    ps.add_if_required(result, sg, task.qf.evaluate_from_dataset(task.data, sg), task)
            #    optimistic_estimate = task.qf.optimistic_generalisation_from_dataset(task.data, sg) if qf_is_bounded else float("inf")

            # compute refinements and fill the queue; the estimate is damped
            # by alpha^(depth+1) before comparing against the threshold
            if len(candidate_description) < task.depth and (
                    optimistic_estimate /
                    self.alpha**(len(candidate_description) + 1)
            ) >= ps.minimum_required_quality(result, task):
                # print(qual)
                # print(optimistic_estimate)
                self.refined[len(candidate_description)] += 1
                # print(str(candidate_description))
                for new_description in operator.refinements(
                        candidate_description):
                    heappush(queue, (-optimistic_estimate, new_description))
            else:
                self.discarded[len(candidate_description)] += 1

        result.sort(key=lambda x: x[0], reverse=True)
        for qual, sg in result:
            print("{} {}".format(qual, sg))
        print("discarded " + str(self.discarded))
        return ps.SubgroupDiscoveryResult(result, task)
Ejemplo n.º 12
0
    def search_internal(self, task, result, sg):
        """Depth-first refinement of *sg*, pruned via optimistic estimate."""
        statistics = task.qf.calculate_statistics(sg)
        estimate = task.qf.optimistic_estimate(sg, statistics)
        threshold = ps.minimum_required_quality(result, task)
        if not estimate > threshold:
            return
        ps.add_if_required(result, sg, task.qf.evaluate(sg, statistics), task)

        # Refine only non-empty subgroups below the maximum depth.
        if sg.depth < task.depth and statistics.size > 0:
            for refined in self.operator.refinements(sg):
                self.search_internal(task, result, refined)
Ejemplo n.º 13
0
    def execute(self, task):
        """Best-first search over selector lists using a priority queue.

        Supports quality functions with either a statistics-based or a
        dataset-based evaluation API. Returns the sorted result list of
        (quality, subgroup) tuples (not wrapped in a result object).
        """
        result = []
        queue = []
        # Choose the evaluation path based on the quality function's API.
        measure_statistics_based = hasattr(
            task.qf, 'optimistic_estimate_from_statistics')

        # init the first level
        for sel in task.search_space:
            queue.append((float("-inf"), [sel]))

        while queue:
            # heapq is a min-heap, so qualities are stored negated.
            q, candidate_description = heappop(queue)
            q = -q
            if q < ps.minimum_required_quality(result, task):
                break

            sg = ps.Subgroup(task.target, candidate_description)

            if measure_statistics_based:
                statistics = sg.get_base_statistics(task.data)
                ps.add_if_required(
                    result, sg, task.qf.evaluate_from_statistics(*statistics),
                    task)
                # Unbounded measures cannot prune, so use +inf.
                optimistic_estimate = task.qf.optimistic_estimate_from_statistics(
                    *statistics) if isinstance(
                        task.qf,
                        ps.BoundedInterestingnessMeasure) else float("inf")
            else:
                ps.add_if_required(
                    result, sg, task.qf.evaluate_from_dataset(task.data, sg),
                    task)
                # Unbounded measures cannot prune, so use +inf.
                optimistic_estimate = task.qf.optimistic_estimate_from_dataset(
                    task.data, sg) if isinstance(
                        task.qf,
                        ps.BoundedInterestingnessMeasure) else float("inf")

            # compute refinements and fill the queue
            if len(
                    candidate_description
            ) < task.depth and optimistic_estimate >= ps.minimum_required_quality(
                    result, task):
                # iterate over all selectors that are behind the last selector contained in the evaluated candidate
                # according to the initial order
                index_of_last_selector = min(
                    task.search_space.index(candidate_description[-1]),
                    len(task.search_space) - 1)

                for sel in islice(task.search_space,
                                  index_of_last_selector + 1, None):
                    new_description = candidate_description + [sel]
                    heappush(queue, (-optimistic_estimate, new_description))
        result.sort(key=lambda x: x[0], reverse=True)
        return result
Ejemplo n.º 14
0
    def search_internal(self, task, result, sg):
        """DFS step honoring monotone constraints and optimistic pruning."""
        statistics = task.qf.calculate_statistics(sg)
        # Monotone constraints: once violated, no refinement can satisfy
        # them, so the whole branch is abandoned.
        if not constraints_hold(task.constraints_monotone, sg, statistics,
                                task.data):
            return
        estimate = task.qf.optimistic_estimate(sg, statistics)
        if not estimate > ps.minimum_required_quality(result, task):
            return
        ps.add_if_required(result, sg, task.qf.evaluate(sg, statistics), task,
                           statistics=statistics)

        if sg.depth < task.depth:
            for refined in self.operator.refinements(sg):
                self.search_internal(task, result, refined)
Ejemplo n.º 15
0
    def execute(self, task):
        """Beam search over conjunctive subgroup descriptions.

        Raises:
            RuntimeError: if the beam is narrower than the result set.
        """
        # Optionally widen the beam to the requested result set size.
        if self.beam_width_adaptive:
            self.beam_width = task.result_set_size

        if self.beam_width < task.result_set_size:
            raise RuntimeError(
                'Beam width in the beam search algorithm is smaller than the result set size!'
            )

        task.qf.calculate_constant_statistics(task)

        # Start from the empty conjunction covering the whole dataset.
        beam = [(0, ps.Conjunction([]),
                 task.qf.calculate_statistics(slice(None), task.data))]
        last_beam = None

        depth = 0
        # Iterate until the beam stabilizes or the maximum depth is reached.
        while beam != last_beam and depth < task.depth:
            last_beam = beam.copy()
            for (_, previous_sg, _) in last_beam:
                # Expand each beam entry at most once across iterations.
                if getattr(previous_sg, 'visited', False):
                    continue
                setattr(previous_sg, 'visited', True)
                for sel in task.search_space:
                    # create a clone extended by one selector
                    new_selectors = list(previous_sg.selectors)
                    if sel in new_selectors:
                        continue
                    new_selectors.append(sel)
                    sg = ps.Conjunction(new_selectors)
                    statistics = task.qf.calculate_statistics(sg, task.data)
                    ps.add_if_required(beam,
                                       sg,
                                       task.qf.evaluate(sg, statistics),
                                       task,
                                       check_for_duplicates=True,
                                       statistics=statistics)
            depth += 1

        # TODO make sure there is no bug here
        result = beam[:task.result_set_size]
        result.sort(key=lambda entry: entry[0], reverse=True)
        return ps.SubgroupDiscoveryResult(result, task)
Ejemplo n.º 16
0
    def execute(self, task):
        """Best-first search using a data-driven specialization operator."""
        result = []
        # Min-heap keyed on negated quality -> behaves as a max-heap.
        queue = [(float("-inf"), ps.Conjunction([]))]

        operator = SpecializationOperator(
            data=task.data.drop(['target'], axis=1),
            n_bins=self.n_bins,
            max_features=self.max_features,
            intervals_only=self.intervals_only,
            binning=self.binning,
            specialization=self.specialization,
            search_space=task.search_space)
        task.qf.calculate_constant_statistics(task.data, task.target)
        while queue:
            negated_quality, old_description = heappop(queue)
            if not -negated_quality > ps.minimum_required_quality(
                    result, task):
                break
            for candidate in operator.refinements(old_description):
                score = task.qf.evaluate(candidate, task.target, task.data,
                                         None)
                ps.add_if_required(result, candidate, score, task)
                if len(candidate) < task.depth:
                    heappush(queue, (-score, candidate))

        result.sort(key=lambda entry: entry[0], reverse=True)
        return ps.SubgroupDiscoveryResult(result, task)
Ejemplo n.º 17
0
    def execute(self, task):
        """Beam search (legacy dataset-based variant).

        Returns the sorted top ``task.result_set_size`` (quality, subgroup)
        tuples.

        Raises:
            RuntimeError: if the beam is narrower than the result set.
        """
        # adapt beam width to the result set size if desired
        if self.beam_width_adaptive:
            self.beam_width = task.result_set_size

        # check if beam size is too small for the result set
        if self.beam_width < task.result_set_size:
            raise RuntimeError(
                'Beam width in the beam search algorithm is smaller than the result set size!'
            )

        # init: start from the empty description
        beam = [(0, ps.Subgroup(task.target, []))]
        last_beam = None

        depth = 0
        # Iterate until the beam stabilizes or the maximum depth is reached.
        while beam != last_beam and depth < task.depth:
            last_beam = beam.copy()
            for (_, last_sg) in last_beam:
                for sel in task.search_space:
                    # create a clone extended by one selector
                    new_selectors = list(
                        last_sg.subgroup_description.selectors)
                    if sel not in new_selectors:
                        new_selectors.append(sel)
                        sg = ps.Subgroup(task.target, new_selectors)
                        quality = task.qf.evaluate_from_dataset(task.data, sg)
                        ps.add_if_required(beam,
                                           sg,
                                           quality,
                                           task,
                                           check_for_duplicates=True)
            depth += 1

        result = beam[:task.result_set_size]
        result.sort(key=lambda x: x[0], reverse=True)
        return result
Ejemplo n.º 18
0
    def execute(self, task):
        """Apriori-style level-wise search (legacy dataset-based variant).

        Evaluates candidates level by level, keeps only the promising ones
        (optimistic estimate above the current threshold), and generates the
        next level by joining candidates that share all but their last
        selector. Returns the sorted result list of (quality, subgroup)
        tuples.
        """
        # Choose the evaluation path based on the quality function's API.
        measure_statistics_based = hasattr(
            task.qf, 'optimistic_estimate_from_statistics')
        result = []

        # init the first level
        next_level_candidates = []
        for sel in task.search_space:
            next_level_candidates.append(ps.Subgroup(task.target, [sel]))

        # level-wise search
        depth = 1
        while next_level_candidates:
            # check sgs from the last level
            promising_candidates = []
            for sg in next_level_candidates:
                if measure_statistics_based:
                    statistics = sg.get_base_statistics(task.data)
                    ps.add_if_required(
                        result, sg,
                        task.qf.evaluate_from_statistics(*statistics), task)
                    # Unbounded measures cannot prune, so use +inf.
                    optimistic_estimate = task.qf.optimistic_estimate_from_statistics(
                        *statistics) if isinstance(
                            task.qf,
                            ps.BoundedInterestingnessMeasure) else float("inf")
                else:
                    ps.add_if_required(
                        result, sg,
                        task.qf.evaluate_from_dataset(task.data, sg), task)
                    # Unbounded measures cannot prune, so use +inf.
                    optimistic_estimate = task.qf.optimistic_estimate_from_dataset(
                        task.data, sg) if isinstance(
                            task.qf,
                            ps.BoundedInterestingnessMeasure) else float("inf")

                # optimistic_estimate = task.qf.optimistic_estimate_from_dataset(task.data, sg)
                # if isinstance(task.qf, m.BoundedInterestingnessMeasure) else float("inf")
                # quality = task.qf.evaluate_from_dataset(task.data, sg)
                # ut.add_if_required (result, sg, quality, task)
                if optimistic_estimate >= ps.minimum_required_quality(
                        result, task):
                    promising_candidates.append(
                        sg.subgroup_description.selectors)

            if depth == task.depth:
                break

            # generate candidates next level: join pairs that share their
            # full prefix (all selectors except the last one)
            next_level_candidates = []
            for i, sg1 in enumerate(promising_candidates):
                for j, sg2 in enumerate(promising_candidates):
                    if i < j and sg1[:-1] == sg2[:-1]:
                        candidate = list(sg1) + [sg2[-1]]
                        # check if ALL generalizations are contained in promising_candidates
                        generalization_descriptions = [[
                            x for x in candidate if x != sel
                        ] for sel in candidate]
                        if all(g in promising_candidates
                               for g in generalization_descriptions):
                            next_level_candidates.append(
                                ps.Subgroup(task.target, candidate))
            depth = depth + 1

        result.sort(key=lambda x: x[0], reverse=True)
        return result