Code example #1
File: irep.py Project: jbdatascience/wittgenstein
    def _grow_ruleset(self, pos_df, neg_df, prune_size, random_state=None, verbosity=0):
        """ Grow a Ruleset with (optional) pruning. """

        ruleset = Ruleset()
        ruleset._set_possible_conds(pos_df, neg_df)

        if not prune_size: prune_size = 0 # If not pruning, use all the data for growing
        pos_remaining = pos_df.copy()
        neg_remaining = neg_df.copy()
        self.rules = []
        while len(pos_remaining) > 0: # Stop adding disjunctions if there are no more positive examples to cover
            # Grow-prune split remaining uncovered examples (if applicable)
            pos_growset, pos_pruneset = base.df_shuffled_split(pos_remaining, (1-prune_size), random_state=random_state)
            neg_growset, neg_pruneset = base.df_shuffled_split(neg_remaining, (1-prune_size), random_state=random_state)
            if self.verbosity>=2:
                print(f'pos_growset {len(pos_growset)} pos_pruneset {len(pos_pruneset)}')
                print(f'neg_growset {len(neg_growset)} neg_pruneset {len(neg_pruneset)}')
                if not prune_size: print(f'(pruning is turned off)')

            # Grow Rule
            grown_rule = base.grow_rule(pos_growset, neg_growset, ruleset.possible_conds, verbosity=self.verbosity)

            # If not pruning, add Rule to Ruleset and drop only the covered positive examples
            if not prune_size:
                ruleset.add(grown_rule)
                if self.verbosity>=2:
                    print(f"updated ruleset: {ruleset.truncstr(direction='right')}")
                    print()
                rule_covers_pos = grown_rule.covers(pos_remaining)
                pos_remaining = pos_remaining.drop(rule_covers_pos.index, axis=0)
                if self.verbosity>=3:
                    print(f'examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg')
                    print()

            # If pruning, prune Rule, assess if it's time to stop, and drop all covered examples
            else:
                pruned_rule = base.prune_rule(grown_rule, _IREP_prune_metric, pos_pruneset, neg_pruneset, verbosity=self.verbosity)

                # Stop if the Rule is bad
                prune_precision = base.precision(pruned_rule, pos_pruneset, neg_pruneset)
                if not prune_precision or prune_precision < .50:
                    break
                # Otherwise, add new Rule, remove covered examples, and continue
                else:
                    ruleset.add(pruned_rule)
                    if self.verbosity>=2:
                        print(f"updated ruleset: {ruleset.truncstr(direction='right')}")
                        print()
                    pos_remaining, neg_remaining = base.rm_covered(pruned_rule, pos_remaining, neg_remaining)
                    if self.verbosity>=3:
                        print(f'examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg')
                        print()
        return ruleset
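
For orientation, a minimal usage sketch. It is illustrative only: it assumes the project exposes a public IREP class whose fit() drives _grow_ruleset (mirroring the RIPPER fit() in example #2), that the package imports as wittgenstein, and a made-up toy DataFrame.

    import pandas as pd
    import wittgenstein as lw  # assumed import name for this project

    # Hypothetical toy training set
    df = pd.DataFrame({
        'outlook': ['sunny', 'rain', 'overcast', 'rain', 'sunny'],
        'windy':   ['yes', 'no', 'yes', 'no', 'no'],
        'play':    ['no', 'no', 'yes', 'yes', 'yes'],
    })

    clf = lw.IREP(prune_size=0.33)  # assumed constructor param; holds out ~1/3 of examples for pruning
    clf.fit(df, class_feat='play', pos_class='yes', random_state=42)
    print(clf.ruleset_)  # the Ruleset returned by _grow_ruleset
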
Code example #2
File: ripper.py Project: jbdatascience/wittgenstein
    def fit(self,
            df,
            y=None,
            class_feat=None,
            pos_class=None,
            n_discretize_bins=10,
            random_state=None):
        """ Fit a Ruleset model using a training DataFrame.

            args:
                df <DataFrame>: categorical training dataset
                y <iterable>: class labels corresponding to df rows. Either y or class_feat (see next) must be provided.
                class_feat: column name of class feature (Use if class feature is still in df.)

                pos_class (optional): name of positive class. If not provided, defaults to class of first training example.
                n_discretize_bins (optional): Fit apparent numeric attributes into a maximum of n_discretize_bins discrete bins, inclusive on upper part of range.
                                              Setting to smaller values can improve training speed.
                                              Pass None to disable auto-discretization and treat values as categorical. (default=10)
                random_state: (optional) random state to allow for repeatable results
        """

        ################
        # Stage 0: Setup
        ################

        # Set up trainset, set class feature name, and set pos class name
        df, self.class_feat, self.pos_class = base.trainset_classfeat_posclass(
            df, y=y, class_feat=class_feat, pos_class=pos_class)

        # Precalculate rule df lookup
        #self._set_theory_dl_lookup(df, verbosity=self.verbosity)

        # Anything to discretize?
        df, self.bin_transformer_ = bin_df(df,
                                           n_discretize_bins=n_discretize_bins,
                                           ignore_feats=[self.class_feat],
                                           verbosity=self.verbosity)

        # Split df into pos, neg classes
        pos_df, neg_df = base.pos_neg_split(df, self.class_feat,
                                            self.pos_class)
        pos_df = pos_df.drop(self.class_feat, axis=1)
        neg_df = neg_df.drop(self.class_feat, axis=1)

        # Collect possible conds
        self._set_possible_conds(df)

        ###############################
        # Stage 1: Grow initial Ruleset
        ###############################

        self.ruleset_ = Ruleset()
        self.ruleset_ = self._grow_ruleset(pos_df,
                                           neg_df,
                                           prune_size=self.prune_size,
                                           dl_allowance=self.dl_allowance,
                                           random_state=random_state)
        if self.verbosity >= 1:
            print()
            print('GREW INITIAL RULESET:')
            self.ruleset_.out_pretty()
            print()

        ###########################
        # Stage 2: Optimize Ruleset
        ###########################

        for iter in range(1, self.k + 1):
            # Create a new but reproducible random_state per iteration (if applicable)
            iter_random_state = random_state + 100 + iter if random_state is not None else None
            # Run optimization iteration
            if self.verbosity >= 1:
                print(f'optimization run {iter} of {self.k}')
            newset = self._optimize_ruleset(self.ruleset_,
                                            pos_df,
                                            neg_df,
                                            prune_size=self.prune_size,
                                            random_state=iter_random_state)

            if self.verbosity >= 1:
                print()
                print('OPTIMIZED RULESET:')
                if self.verbosity >= 2:
                    print(
                        f'iteration {iter} of {self.k}\n modified rules {[i for i in range(len(self.ruleset_.rules)) if self.ruleset_.rules[i] != newset.rules[i]]}'
                    )
                newset.out_pretty()
                print()

            if iter != self.k and self.ruleset_ == newset:
                if self.verbosity >= 1:
                    print('No changes were made. Halting optimization.')
                break
            else:
                self.ruleset_ = newset

        #############################################
        # Stage 3: Cover any last remaining positives
        #############################################

        pos_remaining, neg_remaining = base.pos_neg_split(
            df, self.class_feat, self.pos_class)
        pos_remaining = pos_remaining.drop(self.class_feat, axis=1)
        neg_remaining = neg_remaining.drop(self.class_feat, axis=1)
        pos_remaining, neg_remaining = base.rm_covered(self.ruleset_,
                                                       pos_remaining,
                                                       neg_remaining)
        if len(pos_remaining) >= 1:
            if self.verbosity >= 2:
                print(f'{len(pos_remaining)} pos left. Growing final rules...')
            newset = self._grow_ruleset(pos_remaining,
                                        neg_remaining,
                                        initial_ruleset=self.ruleset_,
                                        prune_size=self.prune_size,
                                        dl_allowance=self.dl_allowance,
                                        random_state=random_state)
            if self.verbosity >= 1:
                print('GREW FINAL RULES')
                newset.out_pretty()
                print()
            self.ruleset_ = newset
        else:
            if self.verbosity >= 1: print('All pos covered\n')

        #################################################
        # Stage 4: Remove any rules that don't improve dl
        #################################################

        if self.verbosity >= 2: print('Optimizing dl...')
        mdl_subset, _ = _rs_total_bits(self.ruleset_,
                                       self.ruleset_.possible_conds,
                                       pos_df,
                                       neg_df,
                                       bestsubset_dl=True,
                                       ret_bestsubset=True,
                                       verbosity=self.verbosity)
        self.ruleset_ = mdl_subset
        if self.verbosity >= 1:
            print('FINAL RULESET:')
            self.ruleset_.out_pretty()
            print()
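
A hedged end-to-end sketch of calling this fit(). The import name and the constructor parameters k, prune_size, and dl_allowance are assumptions inferred from the attributes the method reads off self; the data is made up.

    import pandas as pd
    import wittgenstein as lw  # assumed import name

    df = pd.DataFrame({
        'temp':  [60, 65, 70, 75, 80, 85, 90, 95],
        'humid': [70, 90, 85, 70, 65, 75, 80, 90],
        'play':  ['no', 'no', 'yes', 'yes', 'yes', 'yes', 'no', 'no'],
    })

    # k, prune_size, and dl_allowance are read from self inside fit(),
    # so they are presumably set in the constructor
    clf = lw.RIPPER(k=2, prune_size=0.33, dl_allowance=64)
    clf.fit(df, class_feat='play', pos_class='yes',
            n_discretize_bins=4, random_state=0)
    print(clf.ruleset_)
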
Code example #3
File: ripper.py Project: jbdatascience/wittgenstein
    def _grow_ruleset(self,
                      pos_df,
                      neg_df,
                      prune_size,
                      dl_allowance,
                      initial_ruleset=None,
                      random_state=None):
        """ Grow a Ruleset with pruning. """
        pos_remaining = pos_df.copy()
        neg_remaining = neg_df.copy()

        if initial_ruleset is None:
            ruleset = Ruleset()
            ruleset._set_possible_conds(pos_df, neg_df)
        else:
            ruleset = copy.deepcopy(initial_ruleset)

        ruleset_dl = None
        mdl = None  # Minimum encountered description length (in bits)
        dl_diff = 0
        if self.verbosity >= 2:
            print('growing ruleset...')
            print()
        while len(pos_remaining) > 0 and dl_diff <= self.dl_allowance:
            # Grow-prune split remaining uncovered examples
            pos_growset, pos_pruneset = base.df_shuffled_split(
                pos_remaining, (1 - prune_size), random_state=random_state)
            neg_growset, neg_pruneset = base.df_shuffled_split(
                neg_remaining, (1 - prune_size), random_state=random_state)
            if self.verbosity >= 2:
                print(
                    f'pos_growset {len(pos_growset)} pos_pruneset {len(pos_pruneset)}'
                )
                print(
                    f'neg_growset {len(neg_growset)} neg_pruneset {len(neg_pruneset)}'
                )
            if len(pos_growset) == 0:
                break  # Probably safe, but a little dicey to only check pos_growset.

            # Grow Rule
            grown_rule = base.grow_rule(pos_growset,
                                        neg_growset,
                                        ruleset.possible_conds,
                                        verbosity=self.verbosity)
            if grown_rule.isempty():
                break  # Generated an empty rule b/c no good conds exist

            # Prune Rule
            pruned_rule = base.prune_rule(grown_rule,
                                          _RIPPER_growphase_prune_metric,
                                          pos_pruneset,
                                          neg_pruneset,
                                          verbosity=self.verbosity)

            # Add rule; calculate new description length
            ruleset.add(
                pruned_rule
            )  # Unlike IREP, IREP*/RIPPER stopping condition is inclusive: "After each rule is added, the total description length of the rule set and examples is computed."
            if self.verbosity >= 2:
                print(
                    f"updated ruleset: {ruleset.truncstr(direction='right')}")
                print()

            if ruleset_dl is None:  # First Rule to be added
                rule_dl = _r_theory_bits(pruned_rule,
                                         ruleset.possible_conds,
                                         verbosity=self.verbosity)
                theory_dl = rule_dl
                data_dl = _exceptions_bits(ruleset,
                                           pos_df,
                                           neg_df,
                                           verbosity=self.verbosity)
                ruleset_dl = theory_dl + data_dl
                mdl = ruleset_dl
            else:
                rule_dl = _r_theory_bits(pruned_rule,
                                         ruleset.possible_conds,
                                         verbosity=self.verbosity)
                theory_dl += rule_dl
                data_dl = _exceptions_bits(ruleset,
                                           pos_df,
                                           neg_df,
                                           verbosity=self.verbosity)
                ruleset_dl = theory_dl + data_dl
                dl_diff = ruleset_dl - mdl

            if self.verbosity >= 3:
                print(f'rule dl: {rnd(rule_dl)}')
                print(f'updated theory dl: {rnd(theory_dl)}')
                print(f'exceptions: {rnd(data_dl)}')
                print(f'total dl: {rnd(ruleset_dl)}')
                if dl_diff <= self.dl_allowance:
                    print(
                        f'mdl {rnd(mdl)} (diff {rnd(dl_diff)} <= {rnd(self.dl_allowance)})'
                    )
                else:
                    print(
                        f'mdl {rnd(mdl)} dl-halt: diff {rnd(dl_diff)} exceeds allowance ({rnd(self.dl_allowance)})'
                    )

            mdl = ruleset_dl if ruleset_dl < mdl else mdl

            # Remove covered examples
            pos_remaining, neg_remaining = base.rm_covered(
                pruned_rule, pos_remaining, neg_remaining)

            if self.verbosity >= 3:
                print(
                    f'examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg'
                )
                print()
        return ruleset
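
The halting rule in the while loop can be isolated into a standalone toy: keep adding rules until the ruleset's total description length exceeds the minimum encountered so far by more than dl_allowance bits. The dl values below are invented for illustration.

    # Standalone toy, not library code: RIPPER's dl-based halting condition
    dl_allowance = 64
    step_dls = [120.0, 95.5, 90.2, 160.9, 93.0]  # hypothetical total dl after each added rule

    mdl = None  # minimum description length seen so far (in bits)
    for step, ruleset_dl in enumerate(step_dls, start=1):
        dl_diff = 0 if mdl is None else ruleset_dl - mdl
        if dl_diff > dl_allowance:
            print(f'halt after rule {step}: diff {dl_diff:.1f} > allowance {dl_allowance}')
            break
        mdl = ruleset_dl if mdl is None or ruleset_dl < mdl else mdl
        print(f'rule {step} kept (mdl so far: {mdl})')
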
Code example #4
File: ripper.py Project: jbdatascience/wittgenstein
    def _optimize_ruleset(self,
                          ruleset,
                          pos_df,
                          neg_df,
                          prune_size,
                          random_state=None):
        """ Optimization phase. """

        if self.verbosity >= 2:
            print('optimizing ruleset...')
            print()

        pos_remaining = pos_df.copy()
        neg_remaining = neg_df.copy()
        original_ruleset = copy.deepcopy(ruleset)
        if self.verbosity >= 4:
            print('calculate original ruleset potential dl...')
        original_dl = _rs_total_bits(original_ruleset,
                                     original_ruleset.possible_conds,
                                     pos_df,
                                     neg_df,
                                     bestsubset_dl=True,
                                     verbosity=self.verbosity)
        if self.verbosity >= 3:
            print(f'original ruleset potential dl: {rnd(original_dl)}')
            print()
        new_ruleset = copy.deepcopy(ruleset)

        for i, rule in enumerate(original_ruleset.rules):
            pos_growset, pos_pruneset = base.df_shuffled_split(
                pos_remaining, (1 - prune_size), random_state=random_state)
            neg_growset, neg_pruneset = base.df_shuffled_split(
                neg_remaining, (1 - prune_size), random_state=random_state)
            if len(pos_growset) == 0:
                break  # Possible where optimization run > 1

            # Create alternative rules
            if self.verbosity >= 4:
                print(
                    f'creating replacement for {i} of {len(original_ruleset.rules)}: {ruleset.rules[i]}'
                )
            g_replacement = base.grow_rule(pos_growset,
                                           neg_growset,
                                           original_ruleset.possible_conds,
                                           initial_rule=Rule(),
                                           verbosity=self.verbosity)
            replacement_ruleset = Ruleset(
                base.i_replaced(original_ruleset.rules, i, g_replacement))
            pr_replacement = base.prune_rule(
                g_replacement,
                _RIPPER_optimization_prune_metric,
                pos_pruneset,
                neg_pruneset,
                eval_index_on_ruleset=(i, replacement_ruleset),
                verbosity=self.verbosity)
            replacement_ruleset = Ruleset(
                base.i_replaced(original_ruleset.rules, i, pr_replacement))
            if self.verbosity >= 3:
                print(f'grew replacement {g_replacement}')
                print(f'pruned replacement is {pr_replacement}')

            if self.verbosity >= 3:
                print(
                    f'creating revision for {i} of {len(original_ruleset.rules)}: {ruleset.rules[i]}'
                )
            g_revision = base.grow_rule(pos_growset,
                                        neg_growset,
                                        original_ruleset.possible_conds,
                                        initial_rule=ruleset.rules[i],
                                        verbosity=self.verbosity)
            revision_ruleset = Ruleset(
                base.i_replaced(original_ruleset.rules, i, g_revision))
            pr_revision = base.prune_rule(
                g_revision,
                _RIPPER_optimization_prune_metric,
                pos_pruneset,
                neg_pruneset,
                eval_index_on_ruleset=(i, revision_ruleset),
                verbosity=self.verbosity)
            revision_ruleset = Ruleset(
                base.i_replaced(original_ruleset.rules, i, pr_revision))
            if self.verbosity >= 3:
                print(f'grew revision {g_revision}')
                print(f'pruned revision is {pr_revision}')
                print()

            # Calculate alternative Rulesets' respective lowest potential dls to identify the best version
            if self.verbosity >= 3:
                print(
                    f'calculate potential dl for ds with replacement {pr_replacement}'
                )
            replacement_dl = (_rs_total_bits(replacement_ruleset, original_ruleset.possible_conds,
                                             pos_df, neg_df, bestsubset_dl=True, verbosity=self.verbosity)
                              if pr_replacement != rule else original_dl)
            if self.verbosity >= 3:
                print(
                    f'calculate potential dl for ds with revision {pr_revision}'
                )
            revision_dl = (_rs_total_bits(revision_ruleset, original_ruleset.possible_conds,
                                          pos_df, neg_df, bestsubset_dl=True, verbosity=self.verbosity)
                           if pr_revision != rule else original_dl)
            best_rule = [rule, pr_replacement, pr_revision][base.argmin(
                [original_dl, replacement_dl, revision_dl])]

            if self.verbosity >= 2:
                print(f'\nrule {i+1} of {len(original_ruleset.rules)}')
                rep_str = str(pr_replacement) if pr_replacement != rule else 'unchanged'
                rev_str = str(pr_revision) if pr_revision != rule else 'unchanged'
                best_str = str(best_rule) if best_rule != rule else 'unchanged'
                if self.verbosity == 2:
                    print(f'original: {rule}')
                    print(f'replacement: {rep_str}')
                    print(f'revision: {rev_str}')
                    print(f'*best: {best_str}')
                    print()
                else:
                    print(f'original: {rule} | {rnd(original_dl)} bits')
                    print(
                        f'replacement: {rep_str} | {rnd(replacement_dl)} bits')
                    print(f'revision: {rev_str} | {rnd(revision_dl)} bits')
                    print(
                        f'*best: {best_str} | {rnd(min([replacement_dl, revision_dl, original_dl]))} bits'
                    )
                    print()
            new_ruleset.rules[i] = best_rule

            # Remove covered examples
            pos_remaining, neg_remaining = base.rm_covered(
                rule, pos_remaining, neg_remaining)
            if self.verbosity >= 3:
                print(
                    f'examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg'
                )
                print()

            # If there are no pos data remaining to train optimization (could happen if optimization run >1), keep remaining rules the same
            if len(pos_remaining) == 0: break

        return new_ruleset
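
The decision near the end of the loop, choosing among the original rule, its pruned replacement, and its pruned revision, reduces to an argmin over their potential description lengths. A standalone toy with invented dl figures:

    # Standalone toy, not library code: pick whichever variant minimizes dl
    def argmin(values):
        """Index of the smallest value (mirrors what base.argmin is used for above)."""
        return min(range(len(values)), key=values.__getitem__)

    original_dl, replacement_dl, revision_dl = 112.4, 98.7, 105.1  # hypothetical bits
    variants = ['original', 'replacement', 'revision']
    best = variants[argmin([original_dl, replacement_dl, revision_dl])]
    print(best)  # -> replacement; ties go to the original, which is listed first
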
Code example #5
    def fit(self,
            df,
            y=None,
            class_feat=None,
            pos_class=None,
            n_discretize_bins=None,
            random_state=None):
        """ Fit a Ruleset model using a training DataFrame.

            args:
                df <DataFrame>: categorical training dataset
                y <iterable>: class labels corresponding to df rows. Either y or class_feat (see next) must be provided.
                class_feat: column name of class feature (Use if class feature is still in df.)

                pos_class (optional): name of positive class. If not provided, defaults to class of first training example.
                n_discretize_bins (optional): try to fit apparent numeric attributes into n_discretize_bins discrete bins.
                                              Pass None to disable auto-discretization and treat values as categorical. (default=None)
                random_state: (optional) random state to allow for repeatable results
        """

        ################
        # Stage 0: Setup
        ################

        # Set up trainset, set class feature name, and set pos class name
        df, self.class_feat, self.pos_class = base.trainset_classfeat_posclass(
            df, y=y, class_feat=class_feat, pos_class=pos_class)

        # Precalculate rule df lookup
        #self._set_theory_dl_lookup(df, verbosity=self.verbosity)

        # Anything to discretize?
        numeric_feats = base.find_numeric_feats(df,
                                                min_unique=n_discretize_bins,
                                                ignore_feats=[self.class_feat])
        if numeric_feats:
            if n_discretize_bins is not None:
                if self.verbosity == 1:
                    print(f'binning data...\n')
                elif self.verbosity >= 2:
                    print(f'binning features {numeric_feats}...')
                self.bin_transformer_ = fit_bins(
                    df,
                    n_bins=n_discretize_bins,
                    output=False,
                    ignore_feats=[self.class_feat],
                    verbosity=self.verbosity)
                binned_df = bin_transform(df, self.bin_transformer_)
            else:
                n_unique_values = sum(len(df[f].unique()) for f in numeric_feats)
                warnings.warn(
                    f'Optional param n_discretize_bins=None, but there are apparent numeric features: {numeric_feats}. \n Treating {n_unique_values} numeric values as nominal',
                    RuntimeWarning)
                binned_df = None
        else:
            binned_df = None

        # Split df into pos, neg classes
        fit_df = df if binned_df is None else binned_df
        pos_df, neg_df = base.pos_neg_split(fit_df, self.class_feat, self.pos_class)
        pos_df = pos_df.drop(self.class_feat, axis=1)
        neg_df = neg_df.drop(self.class_feat, axis=1)

        # Collect possible conds
        self._set_possible_conds(df)

        ###############################
        # Stage 1: Grow initial Ruleset
        ###############################

        self.ruleset_ = Ruleset()
        self.ruleset_ = self._grow_ruleset(pos_df,
                                           neg_df,
                                           prune_size=self.prune_size,
                                           dl_allowance=self.dl_allowance,
                                           random_state=random_state)
        if self.verbosity >= 1:
            print()
            print('GREW INITIAL RULESET:')
            self.ruleset_.out_pretty()
            print()

        ###########################
        # Stage 2: Optimize Ruleset
        ###########################

        for iter in range(self.k):
            # Create a new but reproducible random_state per iteration (if applicable)
            iter_random_state = random_state + 100 + iter if random_state is not None else None
            # Run optimization iteration
            if self.verbosity >= 1:
                print(f'optimization run {iter+1} of {self.k}')
            newset = self._optimize_ruleset(self.ruleset_,
                                            pos_df,
                                            neg_df,
                                            prune_size=self.prune_size,
                                            random_state=iter_random_state)

            if self.verbosity >= 1:
                print()
                print('OPTIMIZED RULESET:')
                if self.verbosity >= 2:
                    print(
                        f'iteration {iter+1} of {self.k}\n modified rules {[i for i in range(len(self.ruleset_.rules)) if self.ruleset_.rules[i] != newset.rules[i]]}'
                    )
                newset.out_pretty()
                print()
            self.ruleset_ = newset

        #############################################
        # Stage 3: Cover any last remaining positives
        #############################################

        pos_remaining, neg_remaining = base.pos_neg_split(
            df, self.class_feat, self.pos_class)
        pos_remaining = pos_remaining.drop(self.class_feat, axis=1)
        neg_remaining = neg_remaining.drop(self.class_feat, axis=1)
        pos_remaining, neg_remaining = base.rm_covered(self.ruleset_,
                                                       pos_remaining,
                                                       neg_remaining)
        if len(pos_remaining) >= 1:
            if self.verbosity >= 2:
                print(f'{len(pos_remaining)} pos left. Growing final rules...')
            newset = self._grow_ruleset(pos_remaining,
                                        neg_remaining,
                                        initial_ruleset=self.ruleset_,
                                        prune_size=self.prune_size,
                                        dl_allowance=self.dl_allowance,
                                        random_state=random_state)
            if self.verbosity >= 1:
                print('GREW FINAL RULES')
                newset.out_pretty()
                print()
            self.ruleset_ = newset
        else:
            if self.verbosity >= 1: print('All pos covered\n')

        #################################################
        # Stage 4: Remove any rules that don't improve dl
        #################################################

        if self.verbosity >= 2: print('Optimizing dl...')
        mdl_subset, _ = _rs_total_bits(self.ruleset_,
                                       self.ruleset_.possible_conds,
                                       pos_df,
                                       neg_df,
                                       bestsubset_dl=True,
                                       ret_bestsubset=True,
                                       verbosity=self.verbosity)
        self.ruleset_ = mdl_subset
        if self.verbosity >= 1:
            print('FINAL RULESET:')
            self.ruleset_.out_pretty()
            print()
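
A sketch of how this older fit() treats numeric data: with n_discretize_bins set, apparent numeric features are binned via fit_bins/bin_transform; with None, every distinct numeric value is treated as a nominal category and a RuntimeWarning is emitted. Import name and constructor parameters are assumed as in the earlier sketches; the data is made up.

    import pandas as pd
    import wittgenstein as lw  # assumed import name

    df = pd.DataFrame({
        'temp':  [61, 66, 72, 78, 84, 91],
        'label': ['n', 'n', 'y', 'y', 'y', 'n'],
    })

    clf = lw.RIPPER(k=2, prune_size=0.33, dl_allowance=64)
    # Bins 'temp' into at most 3 intervals before growing rules
    clf.fit(df, class_feat='label', pos_class='y', n_discretize_bins=3, random_state=1)

    # Passing n_discretize_bins=None instead would leave 'temp' unbinned,
    # treating each of its 6 distinct values as nominal and raising a RuntimeWarning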