def _grow_ruleset(self, pos_df, neg_df, prune_size, random_state=None, verbosity=0):
    """ Grow a Ruleset with (optional) pruning.

        args:
            pos_df <DataFrame>: positive-class training examples (class feature already dropped)
            neg_df <DataFrame>: negative-class training examples (class feature already dropped)
            prune_size: fraction of uncovered examples to hold out for pruning each rule;
                        falsy (None/0) disables pruning and grows on all remaining data
            random_state: (optional) seed for reproducible grow/prune splits
            verbosity: unused -- output verbosity is read from self.verbosity.
                       (kept in the signature for backward compatibility)

        returns: the grown Ruleset
    """
    ruleset = Ruleset()
    ruleset._set_possible_conds(pos_df, neg_df)

    if not prune_size:
        prune_size = 0  # If not pruning, use all the data for growing

    pos_remaining = pos_df.copy()
    neg_remaining = neg_df.copy()

    # Stop adding disjunctions once there are no more positive examples to cover.
    while len(pos_remaining) > 0:
        # Grow-prune split remaining uncovered examples (if applicable)
        pos_growset, pos_pruneset = base.df_shuffled_split(pos_remaining, (1 - prune_size), random_state=random_state)
        neg_growset, neg_pruneset = base.df_shuffled_split(neg_remaining, (1 - prune_size), random_state=random_state)
        if self.verbosity >= 2:
            print(f'pos_growset {len(pos_growset)} pos_pruneset {len(pos_pruneset)}')
            print(f'neg_growset {len(neg_growset)} neg_pruneset {len(neg_pruneset)}')
            if not prune_size:
                print('(pruning is turned off)')

        # Grow Rule
        grown_rule = base.grow_rule(pos_growset, neg_growset, ruleset.possible_conds, verbosity=self.verbosity)

        # If not pruning, add Rule to Ruleset and drop only the covered positive examples
        if not prune_size:
            ruleset.add(grown_rule)
            if self.verbosity >= 2:
                print(f"updated ruleset: {ruleset.truncstr(direction='right')}")
                print()
            rule_covers_pos = grown_rule.covers(pos_remaining)
            pos_remaining = pos_remaining.drop(rule_covers_pos.index, axis=0)
            if self.verbosity >= 3:
                print(f'examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg')
                print()

        # If pruning, prune Rule, assess if it's time to stop, and drop all covered examples
        else:
            pruned_rule = base.prune_rule(grown_rule, _IREP_prune_metric, pos_pruneset, neg_pruneset,
                                          verbosity=self.verbosity)

            # Stop if the Rule is bad (precision on the prune set below 50%, or undefined)
            prune_precision = base.precision(pruned_rule, pos_pruneset, neg_pruneset)
            if not prune_precision or prune_precision < .50:
                break
            # Otherwise, add new Rule, remove covered examples, and continue
            else:
                ruleset.add(pruned_rule)
                if self.verbosity >= 2:
                    print(f"updated ruleset: {ruleset.truncstr(direction='right')}")
                    print()
                pos_remaining, neg_remaining = base.rm_covered(pruned_rule, pos_remaining, neg_remaining)
                if self.verbosity >= 3:
                    print(f'examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg')
                    print()
    return ruleset
def fit(self, df, y=None, class_feat=None, pos_class=None, n_discretize_bins=10, random_state=None):
    """ Fit a Ruleset model using a training DataFrame.

        args:
            df <DataFrame>: categorical training dataset
            y: <iterable>: class labels corresponding to df rows. Parameter y or class_feat (see next) must be provided.
            class_feat: column name of class feature (Use if class feature is still in df.)

            pos_class (optional): name of positive class. If not provided, defaults to class of first training example.
            n_discretize_bins (optional): Fit apparent numeric attributes into a maximum of n_discretize_bins discrete bins, inclusive on upper part of range.
                                          Setting to smaller values can improve training speed.
                                          Pass None to disable auto-discretization and treat values as categorical. (default=10)
            random_state: (optional) random state to allow for repeatable results
    """

    ################
    # Stage 0: Setup
    ################

    # Set up trainset, set class feature name, and set pos class name
    df, self.class_feat, self.pos_class = base.trainset_classfeat_posclass(
        df, y=y, class_feat=class_feat, pos_class=pos_class)

    # Precalculate rule df lookup
    #self._set_theory_dl_lookup(df, verbosity=self.verbosity)

    # Anything to discretize?
    df, self.bin_transformer_ = bin_df(df, n_discretize_bins=n_discretize_bins,
                                       ignore_feats=[self.class_feat],
                                       verbosity=self.verbosity)

    # Split df into pos, neg classes
    pos_df, neg_df = base.pos_neg_split(df, self.class_feat, self.pos_class)
    pos_df = pos_df.drop(self.class_feat, axis=1)
    neg_df = neg_df.drop(self.class_feat, axis=1)

    # Collect possible conds
    self._set_possible_conds(df)

    ###############################
    # Stage 1: Grow initial Ruleset
    ###############################

    # (removed a dead `self.ruleset_ = Ruleset()` that was immediately overwritten below)
    self.ruleset_ = self._grow_ruleset(pos_df, neg_df,
                                       prune_size=self.prune_size,
                                       dl_allowance=self.dl_allowance,
                                       random_state=random_state)
    if self.verbosity >= 1:
        print()
        print('GREW INITIAL RULESET:')
        self.ruleset_.out_pretty()
        print()

    ###########################
    # Stage 2: Optimize Ruleset
    ###########################

    # `iteration` renamed from `iter`, which shadowed the builtin.
    for iteration in range(1, self.k + 1):
        # Create new but reproducible random_state (if applicable).
        # BUGFIX: previously every run reused random_state + 100, so all
        # optimization runs saw identical grow/prune splits. Scaling by the
        # run number gives each run a distinct, still-reproducible seed;
        # run 1 keeps the legacy value (+100) for backward compatibility.
        iter_random_state = random_state + 100 * iteration if random_state is not None else None
        # Run optimization iteration
        if self.verbosity >= 1:
            print(f'optimization run {iteration} of {self.k}')
        newset = self._optimize_ruleset(self.ruleset_, pos_df, neg_df,
                                        prune_size=self.prune_size,
                                        random_state=iter_random_state)

        if self.verbosity >= 1:
            print()
            print('OPTIMIZED RULESET:')
            if self.verbosity >= 2:
                print(
                    f'iteration {iteration} of {self.k}\n modified rules {[i for i in range(len(self.ruleset_.rules)) if self.ruleset_.rules[i]!= newset.rules[i]]}'
                )
            newset.out_pretty()
            print()

        # Early-exit when an optimization pass changed nothing (later passes would too)
        if iteration != self.k and self.ruleset_ == newset:
            if self.verbosity >= 1:
                print('No changes were made. Halting optimization.')
            break
        else:
            self.ruleset_ = newset

    #############################################
    # Stage 3: Cover any last remaining positives
    #############################################

    pos_remaining, neg_remaining = base.pos_neg_split(df, self.class_feat, self.pos_class)
    pos_remaining = pos_remaining.drop(self.class_feat, axis=1)
    neg_remaining = neg_remaining.drop(self.class_feat, axis=1)
    pos_remaining, neg_remaining = base.rm_covered(self.ruleset_, pos_remaining, neg_remaining)
    if len(pos_remaining) >= 1:
        if self.verbosity >= 2:
            print(f'{len(pos_remaining)} pos left. Growing final rules...')
        newset = self._grow_ruleset(pos_remaining, neg_remaining,
                                    initial_ruleset=self.ruleset_,
                                    prune_size=self.prune_size,
                                    dl_allowance=self.dl_allowance,
                                    random_state=random_state)
        if self.verbosity >= 1:
            print('GREW FINAL RULES')
            newset.out_pretty()
            print()
        self.ruleset_ = newset
    else:
        if self.verbosity >= 1:
            print('All pos covered\n')

    #################################################
    # Stage 4: Remove any rules that don't improve dl
    #################################################

    if self.verbosity >= 2:
        print('Optimizing dl...')
    mdl_subset, _ = _rs_total_bits(self.ruleset_, self.ruleset_.possible_conds,
                                   pos_df, neg_df,
                                   bestsubset_dl=True, ret_bestsubset=True,
                                   verbosity=self.verbosity)
    self.ruleset_ = mdl_subset
    if self.verbosity >= 1:
        print('FINAL RULESET:')
        self.ruleset_.out_pretty()
        print()
def _grow_ruleset(self, pos_df, neg_df, prune_size, dl_allowance, initial_ruleset=None, random_state=None):
    """ Grow a Ruleset with pruning.

        args:
            pos_df <DataFrame>: positive-class examples (class feature already dropped)
            neg_df <DataFrame>: negative-class examples (class feature already dropped)
            prune_size: fraction of uncovered examples held out for pruning each rule
            dl_allowance: max bits by which total description length may exceed the
                          minimum encountered before halting rule growth
            initial_ruleset (optional): continue growing a deep copy of an existing
                          Ruleset instead of starting empty
            random_state: (optional) seed for reproducible grow/prune splits

        returns: the grown Ruleset
    """
    pos_remaining = pos_df.copy()
    neg_remaining = neg_df.copy()

    if initial_ruleset is None:
        ruleset = Ruleset()
        ruleset._set_possible_conds(pos_df, neg_df)
    else:
        ruleset = copy.deepcopy(initial_ruleset)

    ruleset_dl = None
    mdl = None  # Minimum encountered description length (in bits)
    dl_diff = 0
    if self.verbosity >= 2:
        print('growing ruleset...')
        print()

    # BUGFIX: honor the dl_allowance parameter (the body previously ignored it
    # and read self.dl_allowance; callers pass dl_allowance=self.dl_allowance,
    # so behavior is unchanged for existing call sites).
    while len(pos_remaining) > 0 and dl_diff <= dl_allowance:
        # Grow-prune split remaining uncovered examples
        pos_growset, pos_pruneset = base.df_shuffled_split(
            pos_remaining, (1 - prune_size), random_state=random_state)
        neg_growset, neg_pruneset = base.df_shuffled_split(
            neg_remaining, (1 - prune_size), random_state=random_state)
        if self.verbosity >= 2:
            print(f'pos_growset {len(pos_growset)} pos_pruneset {len(pos_pruneset)}')
            print(f'neg_growset {len(neg_growset)} neg_pruneset {len(neg_pruneset)}')
        if len(pos_growset) == 0:
            break  # Probably safe, but a little dicey to only check pos_growset.

        # Grow Rule
        grown_rule = base.grow_rule(pos_growset, neg_growset, ruleset.possible_conds,
                                    verbosity=self.verbosity)
        if grown_rule.isempty():
            break  # Generated an empty rule b/c no good conds exist

        # Prune Rule
        pruned_rule = base.prune_rule(grown_rule, _RIPPER_growphase_prune_metric,
                                      pos_pruneset, neg_pruneset,
                                      verbosity=self.verbosity)

        # Add rule; calculate new description length.
        # Unlike IREP, IREP*/RIPPER stopping condition is inclusive:
        # "After each rule is added, the total description length of the rule set and examples is computed."
        ruleset.add(pruned_rule)
        if self.verbosity >= 2:
            print(f"updated ruleset: {ruleset.truncstr(direction='right')}")
            print()

        if ruleset_dl is None:  # First Rule to be added
            rule_dl = _r_theory_bits(pruned_rule, ruleset.possible_conds, verbosity=self.verbosity)
            theory_dl = rule_dl
            data_dl = _exceptions_bits(ruleset, pos_df, neg_df, verbosity=self.verbosity)
            ruleset_dl = theory_dl + data_dl
            mdl = ruleset_dl
        else:
            rule_dl = _r_theory_bits(pruned_rule, ruleset.possible_conds, verbosity=self.verbosity)
            theory_dl += rule_dl
            data_dl = _exceptions_bits(ruleset, pos_df, neg_df, verbosity=self.verbosity)
            ruleset_dl = theory_dl + data_dl
            dl_diff = ruleset_dl - mdl  # halting criterion checked at top of loop

        if self.verbosity >= 3:
            print(f'rule dl: {rnd(rule_dl)}')
            print(f'updated theory dl: {rnd(theory_dl)}')
            print(f'exceptions: {rnd(data_dl)}')
            print(f'total dl: {rnd(ruleset_dl)}')
            if dl_diff <= dl_allowance:
                print(f'mdl {rnd(mdl)} (diff {rnd(dl_diff)} <= {rnd(dl_allowance)})')
            else:
                print(f'mdl {rnd(mdl)} dl-halt: diff {rnd(dl_diff)} exceeds allowance ({rnd(dl_allowance)})')

        # Track the minimum description length seen so far
        mdl = ruleset_dl if ruleset_dl < mdl else mdl

        # Remove covered examples
        pos_remaining, neg_remaining = base.rm_covered(
            pruned_rule, pos_remaining, neg_remaining)
        if self.verbosity >= 3:
            print(f'examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg')
            print()
    return ruleset
def _optimize_ruleset(self, ruleset, pos_df, neg_df, prune_size, random_state=None):
    """ Optimization phase.

        For each rule, grow and prune a 'replacement' (from scratch) and a
        'revision' (from the existing rule), then keep whichever of
        {original, replacement, revision} gives the lowest potential
        description length.

        args:
            ruleset <Ruleset>: the Ruleset to optimize (not mutated; a modified copy is returned)
            pos_df/neg_df <DataFrame>: positive/negative training examples
            prune_size: fraction of remaining examples held out for pruning
            random_state: (optional) seed for reproducible grow/prune splits

        returns: a new Ruleset with each rule replaced by its best alternative
    """
    if self.verbosity >= 2:
        print('optimizing ruleset...')
        print()

    pos_remaining = pos_df.copy()
    neg_remaining = neg_df.copy()
    original_ruleset = copy.deepcopy(ruleset)
    if self.verbosity >= 4:
        print('calculate original ruleset potential dl...')
    original_dl = _rs_total_bits(original_ruleset, original_ruleset.possible_conds,
                                 pos_df, neg_df,
                                 bestsubset_dl=True, verbosity=self.verbosity)
    if self.verbosity >= 3:
        print(f'original ruleset potential dl: {rnd(original_dl)}')
        print()
    new_ruleset = copy.deepcopy(ruleset)

    for i, rule in enumerate(original_ruleset.rules):
        # Grow-prune split the remaining uncovered examples
        pos_growset, pos_pruneset = base.df_shuffled_split(
            pos_remaining, (1 - prune_size), random_state=random_state)
        neg_growset, neg_pruneset = base.df_shuffled_split(
            neg_remaining, (1 - prune_size), random_state=random_state)
        if len(pos_growset) == 0:
            break  # Possible where optimization run > 1

        # Create alternative rule 1: replacement, grown from an empty rule
        if self.verbosity >= 4:
            print(f'creating replacement for {i} of {len(original_ruleset.rules)}: {ruleset.rules[i]}')
        g_replacement = base.grow_rule(pos_growset, neg_growset,
                                       original_ruleset.possible_conds,
                                       initial_rule=Rule(),
                                       verbosity=self.verbosity)
        replacement_ruleset = Ruleset(
            base.i_replaced(original_ruleset.rules, i, g_replacement))
        pr_replacement = base.prune_rule(
            g_replacement, _RIPPER_optimization_prune_metric,
            pos_pruneset, neg_pruneset,
            eval_index_on_ruleset=(i, replacement_ruleset),
            verbosity=self.verbosity)
        replacement_ruleset = Ruleset(
            base.i_replaced(original_ruleset.rules, i, pr_replacement))
        if self.verbosity >= 3:
            print(f'grew replacement {g_replacement}')
            print(f'pruned replacement is {pr_replacement}')

        # Create alternative rule 2: revision, grown from the existing rule
        if self.verbosity >= 3:
            print(f'creating revision for {i} of {len(original_ruleset.rules)}: {ruleset.rules[i]}')
        g_revision = base.grow_rule(pos_growset, neg_growset,
                                    original_ruleset.possible_conds,
                                    initial_rule=ruleset.rules[i],
                                    verbosity=self.verbosity)
        revision_ruleset = Ruleset(
            base.i_replaced(original_ruleset.rules, i, g_revision))
        pr_revision = base.prune_rule(
            g_revision, _RIPPER_optimization_prune_metric,
            pos_pruneset, neg_pruneset,
            eval_index_on_ruleset=(i, revision_ruleset),
            verbosity=self.verbosity)
        revision_ruleset = Ruleset(
            base.i_replaced(original_ruleset.rules, i, pr_revision))
        if self.verbosity >= 3:
            # BUGFIX: these debug lines printed the replacement instead of the revision
            print(f'grew revision {g_revision}')
            print(f'pruned revision is {pr_revision}')
            print()

        # Calculate alternative Rulesets' respective lowest potential dls to identify the best version
        if self.verbosity >= 3:
            print(f'calculate potential dl for ds with replacement {pr_replacement}')
        replacement_dl = _rs_total_bits(replacement_ruleset, original_ruleset.possible_conds,
                                        pos_df, neg_df,
                                        bestsubset_dl=True, verbosity=self.verbosity)\
                         if pr_replacement != rule else original_dl
        if self.verbosity >= 3:
            print(f'calculate potential dl for ds with revision {pr_revision}')
        revision_dl = _rs_total_bits(revision_ruleset, original_ruleset.possible_conds,
                                     pos_df, neg_df,
                                     bestsubset_dl=True, verbosity=self.verbosity)\
                      if pr_revision != rule else original_dl
        best_rule = [rule, pr_replacement, pr_revision][base.argmin(
            [original_dl, replacement_dl, revision_dl])]

        if self.verbosity >= 2:
            print(f'\nrule {i+1} of {len(original_ruleset.rules)}')
            rep_str = str(pr_replacement) if pr_replacement != rule else 'unchanged'
            rev_str = str(pr_revision) if pr_revision != rule else 'unchanged'
            best_str = str(best_rule) if best_rule != rule else 'unchanged'
            if self.verbosity == 2:
                print(f'original: {rule}')
                print(f'replacement: {rep_str}')
                print(f'revision: {rev_str}')
                print(f'*best: {best_str}')
                print()
            else:
                # BUGFIX: removed stray ')' that appeared after {rule} in this message
                print(f'original: {rule} | {rnd(original_dl)} bits')
                print(f'replacement: {rep_str} | {rnd(replacement_dl)} bits')
                print(f'revision: {rev_str} | {rnd(revision_dl)} bits')
                print(f'*best: {best_str} | {rnd(min([replacement_dl, revision_dl, original_dl]))} bits')
                print()
        new_ruleset.rules[i] = best_rule

        # Remove covered examples
        pos_remaining, neg_remaining = base.rm_covered(
            rule, pos_remaining, neg_remaining)
        if self.verbosity >= 3:
            print(f'examples remaining: {len(pos_remaining)} pos, {len(neg_remaining)} neg')
            print()

        # If there are no pos data remaining to train optimization (could happen if optimization run >1), keep remaining rules the same
        if len(pos_remaining) == 0:
            break

    return new_ruleset
def fit(self, df, y=None, class_feat=None, pos_class=None, n_discretize_bins=None, random_state=None):
    """ Fit a Ruleset model using a training DataFrame.

        args:
            df <DataFrame>: categorical training dataset
            y: <iterable>: class labels corresponding to df rows. Parameter y or class_feat (see next) must be provided.
            class_feat: column name of class feature (Use if class feature is still in df.)

            pos_class (optional): name of positive class. If not provided, defaults to class of first training example.
            n_discretize_bins (optional): try to fit apparent numeric attributes into n_discretize_bins discrete bins.
                                          Pass None to disable auto-discretization and treat values as categorical. (default=None)
            random_state: (optional) random state to allow for repeatable results
    """

    ################
    # Stage 0: Setup
    ################

    # Set up trainset, set class feature name, and set pos class name
    df, self.class_feat, self.pos_class = base.trainset_classfeat_posclass(
        df, y=y, class_feat=class_feat, pos_class=pos_class)

    # Precalculate rule df lookup
    #self._set_theory_dl_lookup(df, verbosity=self.verbosity)

    # Anything to discretize?
    numeric_feats = base.find_numeric_feats(df, min_unique=n_discretize_bins,
                                            ignore_feats=[self.class_feat])
    if numeric_feats:
        if n_discretize_bins is not None:
            if self.verbosity == 1:
                print('binning data...\n')
            elif self.verbosity >= 2:
                print(f'binning features {numeric_feats}...')
            self.bin_transformer_ = fit_bins(df, n_bins=n_discretize_bins,
                                             output=False,
                                             ignore_feats=[self.class_feat],
                                             verbosity=self.verbosity)
            binned_df = bin_transform(df, self.bin_transformer_)
        else:
            # Warn: numeric-looking features will be treated as nominal
            n_unique_values = sum(len(df[f].unique()) for f in numeric_feats)
            warnings.warn(f'Optional param n_discretize_bins=None, but there are apparent numeric features: {numeric_feats}. \n Treating {n_unique_values} numeric values as nominal', RuntimeWarning)
            binned_df = None
    else:
        binned_df = None

    # Split df into pos, neg classes (use the binned version when available)
    pos_df, neg_df = base.pos_neg_split(
        df if binned_df is None else binned_df, self.class_feat, self.pos_class)
    pos_df = pos_df.drop(self.class_feat, axis=1)
    neg_df = neg_df.drop(self.class_feat, axis=1)

    # Collect possible conds
    self._set_possible_conds(df)

    ###############################
    # Stage 1: Grow initial Ruleset
    ###############################

    # (removed a dead `self.ruleset_ = Ruleset()` that was immediately overwritten below)
    self.ruleset_ = self._grow_ruleset(pos_df, neg_df,
                                       prune_size=self.prune_size,
                                       dl_allowance=self.dl_allowance,
                                       random_state=random_state)
    if self.verbosity >= 1:
        print()
        print('GREW INITIAL RULESET:')
        self.ruleset_.out_pretty()
        print()

    ###########################
    # Stage 2: Optimize Ruleset
    ###########################

    # `iteration` renamed from `iter`, which shadowed the builtin.
    for iteration in range(self.k):
        # Create new but reproducible random_state (if applicable).
        # BUGFIX: previously every run reused random_state + 100, so all
        # optimization runs saw identical grow/prune splits. Scaling by the
        # 1-based run number gives each run a distinct, still-reproducible
        # seed; run 1 keeps the legacy value (+100) for backward compatibility.
        iter_random_state = random_state + 100 * (iteration + 1) if random_state is not None else None
        # Run optimization iteration
        if self.verbosity >= 1:
            print(f'optimization run {iteration+1} of {self.k}')
        newset = self._optimize_ruleset(self.ruleset_, pos_df, neg_df,
                                        prune_size=self.prune_size,
                                        random_state=iter_random_state)

        if self.verbosity >= 1:
            print()
            print('OPTIMIZED RULESET:')
            if self.verbosity >= 2:
                print(
                    f'iteration {iteration+1} of {self.k}\n modified rules {[i for i in range(len(self.ruleset_.rules)) if self.ruleset_.rules[i]!= newset.rules[i]]}'
                )
            newset.out_pretty()
            print()
        self.ruleset_ = newset

    #############################################
    # Stage 3: Cover any last remaining positives
    #############################################

    pos_remaining, neg_remaining = base.pos_neg_split(df, self.class_feat, self.pos_class)
    pos_remaining = pos_remaining.drop(self.class_feat, axis=1)
    neg_remaining = neg_remaining.drop(self.class_feat, axis=1)
    pos_remaining, neg_remaining = base.rm_covered(self.ruleset_, pos_remaining, neg_remaining)
    if len(pos_remaining) >= 1:
        if self.verbosity >= 2:
            print(f'{len(pos_remaining)} pos left. Growing final rules...')
        newset = self._grow_ruleset(pos_remaining, neg_remaining,
                                    initial_ruleset=self.ruleset_,
                                    prune_size=self.prune_size,
                                    dl_allowance=self.dl_allowance,
                                    random_state=random_state)
        if self.verbosity >= 1:
            print('GREW FINAL RULES')
            newset.out_pretty()
            print()
        self.ruleset_ = newset
    else:
        if self.verbosity >= 1:
            print('All pos covered\n')

    #################################################
    # Stage 4: Remove any rules that don't improve dl
    #################################################

    if self.verbosity >= 2:
        print('Optimizing dl...')
    mdl_subset, _ = _rs_total_bits(self.ruleset_, self.ruleset_.possible_conds,
                                   pos_df, neg_df,
                                   bestsubset_dl=True, ret_bestsubset=True,
                                   verbosity=self.verbosity)
    self.ruleset_ = mdl_subset
    if self.verbosity >= 1:
        print('FINAL RULESET:')
        self.ruleset_.out_pretty()
        print()