def extract_rulefit(X, y, feature_names,
                    tree_size=4,
                    max_rules=2000,
                    memory_par=0.01,
                    tree_generator=None,
                    exp_rand_tree_size=True,
                    random_state=None) -> List[str]:
    if tree_generator is None:
        n_estimators_default = int(np.ceil(max_rules / tree_size))
        sample_fract_ = min(0.5, (100 + 6 * np.sqrt(X.shape[0])) / X.shape[0])

        tree_generator = GradientBoostingRegressor(n_estimators=n_estimators_default,
                                                   max_leaf_nodes=tree_size,
                                                   learning_rate=memory_par,
                                                   subsample=sample_fract_,
                                                   random_state=random_state,
                                                   max_depth=100)

    if type(tree_generator) not in [GradientBoostingRegressor, RandomForestRegressor]:
        raise ValueError("RuleFit only works with RandomForest and BoostingRegressor")

    ## fit tree generator
    if not exp_rand_tree_size:
        # simply fit with constant tree size
        tree_generator.fit(X, y)
    else:
        # randomise tree size as per Friedman 2005 Sec 3.3
        np.random.seed(random_state)
        tree_sizes = np.random.exponential(scale=tree_size - 2,
                                           size=int(np.ceil(max_rules * 2 / tree_size)))
        tree_sizes = np.asarray([2 + np.floor(tree_sizes[i_]) for i_ in np.arange(len(tree_sizes))],
                                dtype=int)
        tree_generator.set_params(warm_start=True)
        curr_est_ = 0
        for i_size in np.arange(len(tree_sizes)):
            size = tree_sizes[i_size]
            tree_generator.set_params(n_estimators=curr_est_ + 1)
            tree_generator.set_params(max_leaf_nodes=size)
            random_state_add = random_state if random_state else 0
            # warm_start=True seems to reset random_state, such that the trees are
            # highly correlated, unless we manually change the random_state here.
            tree_generator.set_params(random_state=i_size + random_state_add)
            tree_generator.fit(np.copy(X, order='C'), np.copy(y, order='C'))
            curr_est_ = curr_est_ + 1
        tree_generator.set_params(warm_start=False)

    if isinstance(tree_generator, RandomForestRegressor):
        estimators_ = [[x] for x in tree_generator.estimators_]
    else:
        estimators_ = tree_generator.estimators_

    seen_antecedents = set()
    extracted_rules = []
    for estimator in estimators_:
        for rule_value_pair in tree_to_rules(estimator[0], np.array(feature_names), prediction_values=True):
            if rule_value_pair[0] not in seen_antecedents:
                extracted_rules.append(rule_value_pair)
                seen_antecedents.add(rule_value_pair[0])

    extracted_rules = sorted(extracted_rules, key=lambda x: x[1])
    extracted_rules = list(map(lambda x: x[0], extracted_rules))

    return extracted_rules
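# Hedged usage sketch (illustration only, not part of the module above): one way
# extract_rulefit might be called on a small synthetic dataset. The dataset,
# feature names, and hyperparameter values below are assumptions made for the
# example; only extract_rulefit itself comes from this module.
def _demo_extract_rulefit():
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=200, n_features=5, random_state=0)
    feature_names = [f'feat_{i}' for i in range(X.shape[1])]
    # Returns deduplicated rule antecedents, ordered by the prediction value
    # of the tree leaves they were extracted from.
    rules = extract_rulefit(X, y, feature_names,
                            tree_size=4, max_rules=50, random_state=0)
    print(len(rules), rules[:3])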
def _extract_rules(self):
    self.estimators_, self.estimators_samples_, self.estimators_features_ = [], [], []
    for ensemble in self.tree_generators:
        self.estimators_ += ensemble.estimators_
        self.estimators_samples_ += ensemble.estimators_samples_
        self.estimators_features_ += ensemble.estimators_features_

    extracted_rules = []
    for estimator, features in zip(self.estimators_, self.estimators_features_):
        extracted_rules.append(tree_to_rules(estimator, np.array(self.feature_names_)[features]))
    return extracted_rules
def _extract_rules(self):
    seen_antecedents = set()
    extracted_rules = []
    for estimator in self.estimators_:
        for rule_value_pair in tree_to_rules(estimator[0], np.array(self.feature_names_), prediction_values=True):
            if rule_value_pair[0] not in seen_antecedents:
                extracted_rules.append(rule_value_pair)
                seen_antecedents.add(rule_value_pair[0])

    extracted_rules = sorted(extracted_rules, key=lambda x: x[1])
    extracted_rules = list(map(lambda x: x[0], extracted_rules))
    return extracted_rules
def fit(self, X, y, feature_names=None, sample_weight=None):
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target vector relative to X. Has to follow the convention
        0 for normal data, 1 for anomalies.

    sample_weight : array-like, shape (n_samples,), optional
        Array of weights that are assigned to individual samples, typically
        the amount in case of transactions data. Used to grow regression
        trees producing further rules to be tested.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_ = X.shape[1]
    self.classes_ = unique_labels(y)

    self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
    self.feature_placeholders = list(self.feature_dict_.keys())
    self.feature_names = list(self.feature_dict_.values())

    n_train = y.shape[0]
    w = np.ones(n_train) / n_train
    self.estimators_ = []
    self.estimator_weights_ = []
    self.estimator_errors_ = []
    self.feature_names = feature_names

    for _ in range(self.n_estimators):
        # Fit a classifier with the specific weights
        clf = self.estimator()
        clf.fit(X, y, sample_weight=w)  # uses w as the sampling weight!
        preds = clf.predict(X)

        # Indicator function
        miss = preds != y

        # Equivalent with 1/-1 to update weights
        miss2 = np.ones(miss.size)
        miss2[~miss] = -1

        # Error
        err_m = np.dot(w, miss) / sum(w)
        if err_m < 1e-3:
            return self

        # Alpha
        alpha_m = 0.5 * np.log((1 - err_m) / float(err_m))

        # New weights
        w = np.multiply(w, np.exp([float(x) * alpha_m for x in miss2]))

        self.estimators_.append(deepcopy(clf))
        self.estimator_weights_.append(alpha_m)
        self.estimator_errors_.append(err_m)

    rules = []
    for est, est_weight in zip(self.estimators_, self.estimator_weights_):
        if type(clf) == DecisionTreeClassifier:
            est_rules_values = tree_to_rules(est, self.feature_placeholders, prediction_values=True)
            est_rules = list(map(lambda x: x[0], est_rules_values))

            # BRS scores are the difference between class 1 % and class 0 % in a node
            est_values = np.array(list(map(lambda x: x[1], est_rules_values)))
            rule_scores = (est_values[:, 1] - est_values[:, 0]) / est_values.sum(axis=1)

            compos_score = est_weight * rule_scores
            rules += [Rule(r, args=[w]) for (r, w) in zip(est_rules, compos_score)]

        if type(clf) == SlipperClassifier:
            # SLIPPER uses uniform confidence over in-rule observations
            est_rule = dict_to_rule(est.rule, est.feature_dict)
            rules += [Rule(est_rule, args=[est_weight])]

    self.rules_without_feature_names_ = rules
    self.rules_ = [
        replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_
    ]
    self.complexity_ = self._get_complexity()
    return self
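# Hedged usage sketch for the boosting fit above (illustration only). The
# enclosing class name, BoostedRulesClassifier, and its constructor arguments
# are assumptions inferred from self.estimator / self.n_estimators used in
# fit(); they are not defined in this excerpt.
def _demo_boosted_rules_fit():
    from sklearn.datasets import make_classification
    from sklearn.tree import DecisionTreeClassifier
    X, y = make_classification(n_samples=150, n_features=4, random_state=1)
    clf = BoostedRulesClassifier(  # hypothetical constructor
        estimator=lambda: DecisionTreeClassifier(max_depth=1),
        n_estimators=5)
    clf.fit(X, y, feature_names=[f'x{i}' for i in range(X.shape[1])])
    # Each fitted rule carries a weight derived from the AdaBoost-style
    # estimator weight and the class-proportion difference in its leaf.
    print(clf.rules_[:3])
    print(clf.estimator_weights_)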
def extract_skope(X, y, feature_names,
                  sample_weight=None,
                  n_estimators=10,
                  max_samples=.8,
                  max_samples_features=1.,
                  bootstrap=False,
                  bootstrap_features=False,
                  max_depths=[3],
                  max_depth_duplication=None,
                  max_features=1.,
                  min_samples_split=2,
                  n_jobs=1,
                  random_state=None,
                  verbose=0) -> Tuple[List[str], List[np.array], List[np.array]]:
    ensembles = []
    if not isinstance(max_depths, Iterable):
        max_depths = [max_depths]

    for max_depth in max_depths:
        bagging_clf = BaggingRegressor(
            base_estimator=DecisionTreeRegressor(
                max_depth=max_depth,
                max_features=max_features,
                min_samples_split=min_samples_split
            ),
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_samples_features,
            bootstrap=bootstrap,
            bootstrap_features=bootstrap_features,
            # oob_score=... XXX may be added
            # if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose
        )
        ensembles.append(bagging_clf)

    y_reg = y
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        weights = sample_weight - sample_weight.min()
        contamination = float(sum(y)) / len(y)
        y_reg = (
            pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
            pow((weights).mean(), 0.5) * (y == 0)
        )
        y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid

    for e in ensembles[:len(ensembles) // 2]:
        e.fit(X, y)

    for e in ensembles[len(ensembles) // 2:]:
        e.fit(X, y_reg)

    estimators_, estimators_samples_, estimators_features_ = [], [], []
    for ensemble in ensembles:
        estimators_ += ensemble.estimators_
        estimators_samples_ += ensemble.estimators_samples_
        estimators_features_ += ensemble.estimators_features_

    extracted_rules = []
    for estimator, features in zip(estimators_, estimators_features_):
        extracted_rules.append(tree_to_rules(estimator, np.array(feature_names)[features]))

    return extracted_rules, estimators_samples_, estimators_features_
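# Hedged usage sketch (illustration only): calling extract_skope on a toy,
# imbalanced dataset. The dataset and feature names are assumptions made for
# the example; extract_skope itself is defined above.
def _demo_extract_skope():
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=300, n_features=6,
                               weights=[0.9, 0.1], random_state=0)
    feature_names = [f'feat_{i}' for i in range(X.shape[1])]
    rules, samples, features = extract_skope(X, y, feature_names,
                                             n_estimators=4,
                                             max_depths=[2, 3],
                                             random_state=0)
    # rules holds one list of rule strings per fitted tree; samples and
    # features record the rows and columns each tree was trained on.
    print(sum(len(tree_rules) for tree_rules in rules))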
def fit(self, X, y, sample_weight=None) -> 'SkopeRulesClassifier':
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape (n_samples,)
        Target vector relative to X. Has to follow the convention
        0 for normal data, 1 for anomalies.

    sample_weight : array-like, shape (n_samples,), optional
        Array of weights that are assigned to individual samples, typically
        the amount in case of transactions data. Used to grow regression
        trees producing further rules to be tested.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_X_y(X, y)
    check_classification_targets(y)
    self.n_features_ = X.shape[1]

    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    if n_classes < 2:
        raise ValueError(
            "This method needs samples of at least 2 classes in the data, but the data contains only one class: %r"
            % self.classes_[0])

    if not isinstance(self.max_depth_duplication, int) and self.max_depth_duplication is not None:
        raise ValueError("max_depth_duplication should be an integer")

    if not set(self.classes_) == {0, 1}:
        warn(
            "Found labels %s. This method assumes target class to be labeled as 1 and normal data to be labeled as "
            "0. Any label different from 0 will be considered as being from the target class."
            % set(self.classes_))
        y = (y > 0)

    # ensure that max_samples is in [1, n_samples]:
    n_samples = X.shape[0]
    if isinstance(self.max_samples, six.string_types):
        raise ValueError(
            'max_samples (%s) is not supported. Valid choices are: "auto", int or float'
            % self.max_samples)
    elif isinstance(self.max_samples, INTEGER_TYPES):
        if self.max_samples > n_samples:
            warn(
                "max_samples (%s) is greater than the total number of samples (%s). max_samples will be set "
                "to n_samples for estimation." % (self.max_samples, n_samples))
            max_samples = n_samples
        else:
            max_samples = self.max_samples
    else:  # float
        if not (0. < self.max_samples <= 1.):
            raise ValueError("max_samples must be in (0, 1], got %r" % self.max_samples)
        max_samples = int(self.max_samples * X.shape[0])
    self.max_samples_ = max_samples

    # default column names:
    feature_names_ = [
        BASE_FEATURE_NAME + x for x in np.arange(X.shape[1]).astype(str)
    ]
    if self.feature_names is not None:
        self.feature_dict_ = {
            BASE_FEATURE_NAME + str(i): feat
            for i, feat in enumerate(self.feature_names)
        }
    else:
        self.feature_dict_ = {
            BASE_FEATURE_NAME + str(i): feat
            for i, feat in enumerate(feature_names_)
        }
    self.feature_names_ = feature_names_

    self._max_depths = self.max_depth \
        if isinstance(self.max_depth, Iterable) else [self.max_depth]

    # define regression target:
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        weights = sample_weight - sample_weight.min()
        contamination = float(sum(y)) / len(y)
        y_reg = (pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
                 pow((weights).mean(), 0.5) * (y == 0))
        y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
    else:
        y_reg = y  # same as any other classification bagging

    clfs = self._get_tree_ensemble(classify=True)
    regs = self._get_tree_ensemble(classify=False)

    self._fit_tree_ensemble(clfs, X, y)
    self._fit_tree_ensemble(regs, X, y_reg)

    self.estimators_, self.estimators_samples_, self.estimators_features_ = [], [], []
    for ensemble in clfs + regs:
        self.estimators_ += ensemble.estimators_
        self.estimators_samples_ += ensemble.estimators_samples_
        self.estimators_features_ += ensemble.estimators_features_

    rules_ = []
    for estimator, samples, features in zip(self.estimators_,
                                            self.estimators_samples_,
                                            self.estimators_features_):
        rules_from_tree = tree_to_rules(
            estimator, np.array(self.feature_names_)[features])
        rules_ += self._add_OOB_scores_to_rules(X, y, rules_from_tree, samples, features)

    self.rules_ = self._filter_rules(rules_)
    self.rules_ = sorted(self.rules_, key=lambda x: -self.f1_score(x))
    self.rules_without_feature_names_ = self.rules_

    # Replace generic feature names by real feature names
    self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                   for rule, perf in self.rules_]

    return self
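# Hedged usage sketch for SkopeRulesClassifier.fit above (illustration only).
# The class name comes from the return annotation of fit(); the default
# construction and the attribute layout noted below are assumptions.
def _demo_skope_rules_fit():
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=400, n_features=5,
                               weights=[0.85, 0.15], random_state=0)
    clf = SkopeRulesClassifier()  # hypothetical default construction
    clf.fit(X, y)
    # After fitting, rules_ pairs each rule (with real feature names when
    # provided) with its out-of-bag performance, sorted by F1 score.
    for rule, perf in clf.rules_[:3]:
        print(rule, perf)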