Example 1
    def find_rules_and_measure_progress(self, X, Y, W, target_class,
                                        base_rules, domain, progress_amount):
        """
        The top-level control procedure of the separate-and-conquer
        algorithm. For the given data and target class (which may be
        None), return a list of rules, each of which strictly adheres
        to the requirements of the rule finder's validators.

        To induce decision lists (ordered rules), set target class to
        None. To induce rule sets (unordered rules), learn rules for
        each class individually, with respect to the original learning
        data.

        Parameters
        ----------
        X, Y, W : ndarray
            Learning data.
        target_class : int or None
            Index of the class to model; None induces rules for all
            classes (a decision list).
        base_rules : list of Rule
            An optional list of initial rules to constrain the search.
        domain : Orange.data.domain.Domain
            Data domain, used to calculate class distributions.
        progress_amount : int
            Percentage of the learning algorithm covered by this
            function call.

        Returns
        -------
        rule_list : list of Rule
            Induced rules.
        """
        initial_class_dist = get_dist(Y, W, domain)
        rule_list = []

        # while the data allows it, keep finding new rules;
        # break the loop if the minimum requirements cannot be met;
        # after finding a rule, remove the instances it covers
        while not self.data_stopping(X, Y, W, target_class):

            # remember the distribution to correctly update progress
            temp_class_dist = get_dist(Y, W, domain)

            # generate a new rule that has not been seen before
            new_rule = self.rule_finder(X, Y, W, target_class, base_rules,
                                        domain, initial_class_dist, rule_list)

            # None when no new, unique rules that pass
            # the general requirements can be found
            if new_rule is None or self.rule_stopping(new_rule):
                break

            # remove the covered instances (exclusive covering) or
            # reduce their weights (weighted covering)
            X, Y, W = self.cover_and_remove(X, Y, W, new_rule)
            rule_list.append(new_rule)

            # update progress
            if self.progress_advance_callback is not None:
                progress = (((temp_class_dist[target_class] -
                              get_dist(Y, W, domain)[target_class])
                             / initial_class_dist[target_class]
                             * progress_amount) if target_class is not None else
                            ((temp_class_dist - get_dist(Y, W, domain)).sum()
                             / initial_class_dist.sum() * progress_amount))
                self.progress_advance_callback(progress)

        return rule_list
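
The docstring above describes two induction modes. Below is a minimal sketch of how they might be driven; the `learner` instance and the even split of `progress_amount` across classes are assumptions for illustration, not part of the original code:

    # ordered rules (decision list): a single pass with target_class=None
    ordered = learner.find_rules_and_measure_progress(
        X, Y, W, target_class=None, base_rules=[], domain=domain,
        progress_amount=100)

    # unordered rules (rule set): one pass per class, each against the
    # original learning data, sharing the progress budget
    unordered = []
    n_classes = len(domain.class_var.values)
    for cls in range(n_classes):
        unordered.extend(learner.find_rules_and_measure_progress(
            X, Y, W, target_class=cls, base_rules=[], domain=domain,
            progress_amount=100 // n_classes))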
Example 2
    def fit_storage(self, data):
        # data.W is an ndarray; test its size to avoid an ambiguous
        # truth value when it holds more than one element
        X, Y, W = data.X, data.Y, data.W if data.W.size else None
        Y = Y.astype(dtype=int)

        # estimate extreme value distributions (if necessary)
        if self.evc and self.to_calc_evds:
            self.calculate_evds(data)
        if self.evc and not self.evds:
            warn("""Extreme value distributions not set.
                    Need to calculate them first. """)
            self.calculate_evds(data)

        prior = get_dist(Y, W, self.domain)
        if not prior.sum():
            return self.classifier(domain=self.domain, rule_list=[])

        # create initial star
        star = self.create_initial_star(X, Y, W, prior)
        # use visited to prevent learning the same rule all over again
        visited = set(
            (r.covered_examples.tobytes(), r.target_class) for r in star)
        # update best rule
        bestr = np.empty(X.shape[0], dtype=object)
        bestq = np.zeros(X.shape[0], dtype=float)
        for r in star:
            if self.rule_validator.validate_rule(r):
                self.update_best(bestr, bestq, r, Y)
        # loop until star has rules
        self.inter_rules = []  # store intermediate rules
        rlength = 0
        while star:
            rlength += 1
            # specialize each rule in star
            new_star = []
            for r in star:
                # skip rules that are already pure (they cover only
                # target-class examples and cannot be improved)
                if (r.curr_class_dist[r.target_class] ==
                        r.curr_class_dist.sum()):
                    continue
                # refine rule
                rules = self.rule_finder.search_strategy.refine_rule(
                    X, Y, W, r)
                # process the refined rules
                for nr in rules:
                    nr.default_rule = nr.parent_rule.default_rule
                    nr.do_evaluate()
                    rkey = (nr.covered_examples.tobytes(), nr.target_class)
                    general_validator = self.rule_finder.general_validator
                    if (rkey not in visited
                            and general_validator.validate_rule(nr)
                            and nr.quality >= nr.parent_rule.quality):
                        # rule is consistent with basic conditions
                        # can it be new best?
                        if self.rule_validator.validate_rule(nr):
                            self.update_best(bestr, bestq, nr, Y)
                        # can it be further specialized?
                        if (self.specialization_validator.validate_rule(nr)
                                and nr.length < self.max_rule_length):
                            new_star.append(nr)
                    visited.add(rkey)
            # assign a rank to each rule in new star
            nrules = len(new_star)
            inst_quality = np.zeros((X.shape[0], nrules))
            for ri, r in enumerate(new_star):
                if self.target_instances:  # learn rules for specific instances only
                    c2 = np.zeros(r.covered_examples.shape, dtype=bool)
                    c2[self.target_instances] = 1
                    cov = np.where(c2 & r.covered_examples
                                   & (r.target_class == Y))[0]
                else:
                    cov = np.where(r.covered_examples
                                   & (r.target_class == Y))[0]
                inst_quality[cov, ri] = r.quality
            sel_rules = -min(nrules, 5)
            queues = np.argsort(inst_quality)[:, sel_rules:]
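            # each row of queues now holds the indices of the (up to
            # five) best-scoring rules for that instance, worst to best;
            # zero-quality entries are skipped when queues are popped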

            # create new star from queues
            new_star_set = set()
            index = -1
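            # index walks each queue from the best rule (-1) towards
            # progressively worse ones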
            while len(new_star_set) < self.width:
                if index < sel_rules:
                    break
                # pop one rule from each queue and put into a temporary counter
                cnt = Counter()
                for qi, q in enumerate(queues):
                    ri = q[index]
                    if inst_quality[qi, ri] > 0:
                        cnt[ri] += 1
                if not cnt:
                    break
                elts = cnt.most_common()
                for e, _ in elts:
                    if e in new_star_set:
                        continue
                    new_star_set.add(e)
                    if len(new_star_set) >= self.width:
                        break
                index -= 1
            star = [new_star[ri] for ri in new_star_set]
            if self.store_intermediate_rules:
                rl = []
                vis = set()
                for ri, r in enumerate(bestr):
                    if r is None:
                        continue
                    self.add_rule(rl, vis, r)
                self.inter_rules.append(rl)

        # select best rules
        rule_list = []
        visited = set()
        for ri, r in enumerate(bestr):
            # add r
            if r is None:
                continue
            self.add_rule(rule_list, visited, r)
            if not hasattr(r, "best_instance"):
                r.best_instance = [data[ri]]
            else:
                r.best_instance.append(data[ri])
            if self.add_sub_rules:
                # create parent rule
                pr = self.create_parent(r, X, Y, W)
                while pr is not None:
                    # add parent rule
                    self.add_rule(rule_list, visited, pr)
                    pr = self.create_parent(pr, X, Y, W)
        rule_list = sorted(rule_list, key=lambda r: -r.quality)
        if self.min_unique_examples > 1:
            filter_rules = []
            covered = np.zeros(X.shape[0], dtype=bool)
            for r in rule_list:
                if (~covered &
                        r.covered_examples).sum() >= self.min_unique_examples:
                    filter_rules.append(r)
                    covered |= r.covered_examples
            rule_list = filter_rules
        return self.classifier(domain=self.domain, rule_list=rule_list)
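
A hedged usage sketch for the learner above; the class name CustomRuleLearner, its constructor arguments, and the rule_list attribute on the returned classifier are assumptions (only fit_storage itself appears in the example):

    from Orange.data import Table

    data = Table("titanic")  # built-in Orange dataset
    learner = CustomRuleLearner(evc=False, width=5)  # hypothetical name/args
    classifier = learner.fit_storage(data)
    for rule in classifier.rule_list:  # assuming rule_list is exposed
        print(rule, rule.quality)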