Example #1
def extract_rulefit(X, y, feature_names,
                    tree_size=4,
                    max_rules=2000,
                    memory_par=0.01,
                    tree_generator=None,
                    exp_rand_tree_size=True,
                    random_state=None) -> List[str]:

    if tree_generator is None:
        n_estimators_default = int(np.ceil(max_rules / tree_size))
        sample_fract_ = min(0.5, (100 + 6 * np.sqrt(X.shape[0])) / X.shape[0])

        tree_generator = GradientBoostingRegressor(n_estimators=n_estimators_default,
                                                    max_leaf_nodes=tree_size,
                                                    learning_rate=memory_par,
                                                    subsample=sample_fract_,
                                                    random_state=random_state,
                                                    max_depth=100)

    if type(tree_generator) not in [GradientBoostingRegressor, RandomForestRegressor]:
        raise ValueError("RuleFit only works with GradientBoostingRegressor and RandomForestRegressor")

    ## fit tree generator
    if not exp_rand_tree_size:  # simply fit with constant tree size
        tree_generator.fit(X, y)
    else:  # randomise tree size as per Friedman 2005 Sec 3.3
        np.random.seed(random_state)
        tree_sizes = np.random.exponential(scale=tree_size - 2,
                                            size=int(np.ceil(max_rules * 2 / tree_size)))
        tree_sizes = np.asarray([2 + np.floor(tree_sizes[i_]) for i_ in np.arange(len(tree_sizes))], dtype=int)
        tree_generator.set_params(warm_start=True)
        curr_est_ = 0
        for i_size in np.arange(len(tree_sizes)):
            size = tree_sizes[i_size]
            tree_generator.set_params(n_estimators=curr_est_ + 1)
            tree_generator.set_params(max_leaf_nodes=size)
            random_state_add = random_state if random_state else 0
            # warm_start=True seems to reset random_state, such that the trees are
            # highly correlated, unless we manually change the random_state here.
            tree_generator.set_params(random_state=i_size + random_state_add)
            tree_generator.fit(np.copy(X, order='C'), np.copy(y, order='C'))
            curr_est_ = curr_est_ + 1
        tree_generator.set_params(warm_start=False)

    if isinstance(tree_generator, RandomForestRegressor):
        estimators_ = [[x] for x in tree_generator.estimators_]
    else:
        estimators_ = tree_generator.estimators_

    seen_antecedents = set()
    extracted_rules = [] 
    for estimator in estimators_:
        for rule_value_pair in tree_to_rules(estimator[0], np.array(feature_names), prediction_values=True):
            if rule_value_pair[0] not in seen_antecedents:
                extracted_rules.append(rule_value_pair)
                seen_antecedents.add(rule_value_pair[0])
    
    extracted_rules = sorted(extracted_rules, key=lambda x: x[1])
    extracted_rules = list(map(lambda x: x[0], extracted_rules))
    return extracted_rules
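
A minimal usage sketch for the extract_rulefit function above. The import path and the toy data are assumptions for illustration; the function itself depends on the tree_to_rules helper defined elsewhere in the same module.

# Hedged usage sketch: assumes extract_rulefit (and its tree_to_rules
# dependency) can be imported from the surrounding module, e.g. imodels.util.extract.
import numpy as np
from sklearn.datasets import make_regression

from imodels.util.extract import extract_rulefit  # assumed import path

X, y = make_regression(n_samples=200, n_features=5, random_state=0)
feature_names = [f"x{i}" for i in range(X.shape[1])]

rules = extract_rulefit(X, y, feature_names,
                        tree_size=4,
                        max_rules=50,
                        random_state=0)

print(len(rules))   # number of unique rule antecedents
print(rules[:3])    # conjunctions such as "x0 <= 0.12 and x3 > -0.5"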
Example #2
    def _extract_rules(self):
        self.estimators_, self.estimators_samples_, self.estimators_features_ = [], [], []
        for ensemble in self.tree_generators:
            self.estimators_ += ensemble.estimators_
            self.estimators_samples_ += ensemble.estimators_samples_
            self.estimators_features_ += ensemble.estimators_features_

        extracted_rules = []
        for estimator, features in zip(self.estimators_, self.estimators_features_):
            extracted_rules.append(tree_to_rules(estimator, np.array(self.feature_names_)[features]))
        return extracted_rules
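
Examples #1-#3 all rely on a tree_to_rules helper that is not shown here. The sketch below is only an illustration of what such a helper typically does (an assumption, not the library's actual implementation): walk a fitted scikit-learn tree and emit one conjunction of threshold conditions per root-to-leaf path.

# Illustrative sketch only (hypothetical helper, not the library's tree_to_rules):
# turn a fitted sklearn tree into one "feat <= thr and ..." string per leaf.
import numpy as np
from sklearn.tree import DecisionTreeRegressor

def tree_to_rules_sketch(tree, feature_names):
    t = tree.tree_
    rules = []

    def recurse(node, conditions):
        if t.children_left[node] == -1:  # leaf node
            rules.append(" and ".join(conditions) if conditions else "True")
            return
        name = feature_names[t.feature[node]]
        threshold = t.threshold[node]
        recurse(t.children_left[node], conditions + ["%s <= %.3f" % (name, threshold)])
        recurse(t.children_right[node], conditions + ["%s > %.3f" % (name, threshold)])

    recurse(0, [])
    return rules

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y = X[:, 0] + 0.5 * X[:, 1]
est = DecisionTreeRegressor(max_depth=2, random_state=0).fit(X, y)
print(tree_to_rules_sketch(est, ["a", "b", "c"]))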
Example #3
    def _extract_rules(self):
        seen_antecedents = set()
        extracted_rules = []
        for estimator in self.estimators_:
            for rule_value_pair in tree_to_rules(estimator[0], np.array(self.feature_names_), prediction_values=True):
                if rule_value_pair[0] not in seen_antecedents:
                    extracted_rules.append(rule_value_pair)
                    seen_antecedents.add(rule_value_pair[0])

        extracted_rules = sorted(extracted_rules, key=lambda x: x[1])
        extracted_rules = list(map(lambda x: x[0], extracted_rules))
        return extracted_rules
Example #4
    def fit(self, X, y, feature_names=None, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for anomalies.

        feature_names : list of str, optional
            Names of the features, used to express the extracted rules in
            terms of the original feature names.

        sample_weight : array-like, shape (n_samples,), optional
            Array of weights that are assigned to individual samples, typically
            the amount in case of transactions data. Used to grow regression
            trees producing further rules to be tested.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]
        self.classes_ = unique_labels(y)

        self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
        self.feature_placeholders = list(self.feature_dict_.keys())
        self.feature_names = list(self.feature_dict_.values())

        n_train = y.shape[0]
        w = np.ones(n_train) / n_train
        self.estimators_ = []
        self.estimator_weights_ = []
        self.estimator_errors_ = []
        self.feature_names = feature_names
        for _ in range(self.n_estimators):
            # Fit a classifier with the specific weights
            clf = self.estimator()
            clf.fit(X, y, sample_weight=w)  # uses w as the sampling weight!
            preds = clf.predict(X)

            # Indicator function
            miss = preds != y

            # Equivalent with 1/-1 to update weights
            miss2 = np.ones(miss.size)
            miss2[~miss] = -1

            # Error
            err_m = np.dot(w, miss) / sum(w)
            if err_m < 1e-3:
                return self

            # Alpha
            alpha_m = 0.5 * np.log((1 - err_m) / float(err_m))

            # New weights
            w = np.multiply(w, np.exp([float(x) * alpha_m
                                       for x in miss2]))

            self.estimators_.append(deepcopy(clf))
            self.estimator_weights_.append(alpha_m)
            self.estimator_errors_.append(err_m)

        rules = []

        for est, est_weight in zip(self.estimators_, self.estimator_weights_):
            if isinstance(est, DecisionTreeClassifier):
                est_rules_values = tree_to_rules(est, self.feature_placeholders, prediction_values=True)
                est_rules = list(map(lambda x: x[0], est_rules_values))

                # BRS scores are the difference between the class-1 and class-0 fractions in a node
                est_values = np.array(list(map(lambda x: x[1], est_rules_values)))
                rule_scores = (est_values[:, 1] - est_values[:, 0]) / est_values.sum(axis=1)

                compos_score = est_weight * rule_scores
                rules += [Rule(r, args=[w]) for (r, w) in zip(est_rules, compos_score)]

            elif isinstance(est, SlipperClassifier):
                # SLIPPER uses a uniform confidence over the observations covered by a rule
                est_rule = dict_to_rule(est.rule, est.feature_dict)
                rules += [Rule(est_rule, args=[est_weight])]

        self.rules_without_feature_names_ = rules
        self.rules_ = [
            replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_
        ]
        self.complexity_ = self._get_complexity()
        return self
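
The training loop in Example #4 is the standard discrete AdaBoost update: compute the weighted error, derive the stage weight alpha_m, and multiply the sample weights by exp(±alpha_m). The toy walk-through below (made-up labels and predictions, not part of the original code) isolates that step.

# Toy walk-through of the weight update used above (discrete AdaBoost);
# the labels and predictions are invented purely for illustration.
import numpy as np

y = np.array([1, 1, 0, 0, 1])
preds = np.array([1, 0, 0, 0, 0])        # hypothetical weak-learner output
w = np.ones(y.size) / y.size             # uniform starting weights

miss = preds != y                        # indicator of misclassified samples
miss2 = np.where(miss, 1.0, -1.0)        # +1 for errors, -1 for hits

err_m = np.dot(w, miss) / w.sum()        # weighted error -> 0.4 here
alpha_m = 0.5 * np.log((1 - err_m) / err_m)

w = w * np.exp(miss2 * alpha_m)          # up-weight errors, down-weight hits
print(err_m, alpha_m)                    # 0.4, ~0.2027
print(w)                                 # errors ~0.245, hits ~0.164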
Example #5
def extract_skope(X, y, feature_names, 
                  sample_weight=None,
                  n_estimators=10,
                  max_samples=.8,
                  max_samples_features=1.,
                  bootstrap=False,
                  bootstrap_features=False,
                  max_depths=[3], 
                  max_depth_duplication=None,
                  max_features=1.,
                  min_samples_split=2,
                  n_jobs=1,
                  random_state=None,
                  verbose=0) -> Tuple[List[str], List[np.array], List[np.array]]:
    
    ensembles = []
    if not isinstance(max_depths, Iterable):
        max_depths = [max_depths]

    for max_depth in max_depths:
        bagging_clf = BaggingRegressor(
            base_estimator=DecisionTreeRegressor(
                max_depth=max_depth,
                max_features=max_features,
                min_samples_split=min_samples_split
            ),
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_samples_features,
            bootstrap=bootstrap,
            bootstrap_features=bootstrap_features,
            # oob_score=... XXX may be added
            # if selection on tree perf needed.
            # warm_start=... XXX may be added to increase computation perf.
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose
        )
        ensembles.append(bagging_clf)

    y_reg = y
    if sample_weight is not None:
        sample_weight = check_array(sample_weight, ensure_2d=False)
        weights = sample_weight - sample_weight.min()
        contamination = float(sum(y)) / len(y)
        y_reg = (
                pow(weights, 0.5) * 0.5 / contamination * (y > 0) -
                pow((weights).mean(), 0.5) * (y == 0)
        )
        y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid

    for e in ensembles[:len(ensembles) // 2]:
        e.fit(X, y)

    for e in ensembles[len(ensembles) // 2:]:
        e.fit(X, y_reg)

    estimators_, estimators_samples_, estimators_features_ = [], [], []
    for ensemble in ensembles:
        estimators_ += ensemble.estimators_
        estimators_samples_ += ensemble.estimators_samples_
        estimators_features_ += ensemble.estimators_features_

    extracted_rules = []
    for estimator, features in zip(estimators_, estimators_features_):
        extracted_rules.append(tree_to_rules(estimator, np.array(feature_names)[features]))
    
    return extracted_rules, estimators_samples_, estimators_features_
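
A minimal usage sketch for extract_skope. The import path and toy data are assumptions; the returned tuple contains, per fitted tree, the extracted rule strings plus the sample and feature subsets each tree was trained on.

# Hedged usage sketch: assumes extract_skope (and tree_to_rules) can be
# imported from the surrounding module, e.g. imodels.util.extract.
import numpy as np

from imodels.util.extract import extract_skope  # assumed import path

rng = np.random.RandomState(0)
X = rng.rand(300, 4)
y = (X[:, 0] + X[:, 1] > 1).astype(int)          # 1 = target/anomaly class
feature_names = ["f0", "f1", "f2", "f3"]

rules, samples_, features_ = extract_skope(X, y, feature_names,
                                           n_estimators=5,
                                           max_depths=[2, 3],
                                           random_state=0)

# one list of rule strings per fitted tree, plus the rows/columns it saw
print(len(rules), len(samples_), len(features_))
print(rules[0][:2])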
Example #6
    def fit(self, X, y, sample_weight=None) -> 'SkopeRulesClassifier':
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for anomalies.

        sample_weight : array-like, shape (n_samples,), optional
            Array of weights that are assigned to individual samples, typically
            the amount in case of transactions data. Used to grow regression
            trees producing further rules to be tested.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        if n_classes < 2:
            raise ValueError(
                "This method needs samples of at least 2 classes in the data, but the data contains only one class: %r"
                % self.classes_[0])

        if not isinstance(self.max_depth_duplication,
                          int) and self.max_depth_duplication is not None:
            raise ValueError("max_depth_duplication should be an integer")

        if not set(self.classes_) == {0, 1}:
            warn(
                "Found labels %s. This method assumes target class to be labeled as 1 and normal data to be labeled as "
                "0. Any label different from 0 will be considered as being from the target class."
                % set(self.classes_))
            y = (y > 0)

        # ensure that max_samples is in [1, n_samples]:
        n_samples = X.shape[0]

        if isinstance(self.max_samples, six.string_types):
            raise ValueError(
                'max_samples (%s) is not supported. Valid choices are: "auto", int or float'
                % self.max_samples)

        elif isinstance(self.max_samples, INTEGER_TYPES):
            if self.max_samples > n_samples:
                warn(
                    "max_samples (%s) is greater than the total number of samples (%s). max_samples will be set "
                    "to n_samples for estimation." %
                    (self.max_samples, n_samples))
                max_samples = n_samples
            else:
                max_samples = self.max_samples
        else:  # float
            if not (0. < self.max_samples <= 1.):
                raise ValueError("max_samples must be in (0, 1], got %r" %
                                 self.max_samples)
            max_samples = int(self.max_samples * X.shape[0])
        self.max_samples_ = max_samples

        # default columns names :
        feature_names_ = [
            BASE_FEATURE_NAME + x for x in np.arange(X.shape[1]).astype(str)
        ]
        if self.feature_names is not None:
            self.feature_dict_ = {
                BASE_FEATURE_NAME + str(i): feat
                for i, feat in enumerate(self.feature_names)
            }
        else:
            self.feature_dict_ = {
                BASE_FEATURE_NAME + str(i): feat
                for i, feat in enumerate(feature_names_)
            }
        self.feature_names_ = feature_names_

        self._max_depths = self.max_depth \
            if isinstance(self.max_depth, Iterable) else [self.max_depth]

        # define regression target:
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            weights = sample_weight - sample_weight.min()
            contamination = float(sum(y)) / len(y)
            y_reg = (pow(weights, 0.5) * 0.5 / contamination * (y > 0) - pow(
                (weights).mean(), 0.5) * (y == 0))
            y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
        else:
            y_reg = y  # same as any other classification bagging

        clfs = self._get_tree_ensemble(classify=True)
        regs = self._get_tree_ensemble(classify=False)

        self._fit_tree_ensemble(clfs, X, y)
        self._fit_tree_ensemble(regs, X, y_reg)

        self.estimators_, self.estimators_samples_, self.estimators_features_ = [], [], []

        for ensemble in clfs + regs:
            self.estimators_ += ensemble.estimators_
            self.estimators_samples_ += ensemble.estimators_samples_
            self.estimators_features_ += ensemble.estimators_features_

        rules_ = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):

            rules_from_tree = tree_to_rules(
                estimator,
                np.array(self.feature_names_)[features])
            rules_ += self._add_OOB_scores_to_rules(X, y, rules_from_tree,
                                                    samples, features)

        self.rules_ = self._filter_rules(rules_)
        self.rules_ = sorted(self.rules_, key=lambda x: -self.f1_score(x))
        self.rules_without_feature_names_ = self.rules_

        # Replace generic feature names by real feature names
        self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                       for rule, perf in self.rules_]
        return self
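
A minimal usage sketch for the fit method above. It assumes the surrounding SkopeRulesClassifier accepts feature_names in its constructor (the fit method reads self.feature_names) and is importable, e.g. from imodels; the constructor arguments and import path are assumptions.

# Hedged usage sketch; constructor arguments and import path are assumptions.
import numpy as np

from imodels import SkopeRulesClassifier  # assumed import location

rng = np.random.RandomState(0)
X = rng.rand(400, 3)
y = (X[:, 0] > 0.8).astype(int)      # 1 = target class, 0 = normal data

clf = SkopeRulesClassifier(feature_names=["age", "income", "score"],
                           random_state=0)
clf.fit(X, y)

# rules_ holds (rule, performance) pairs sorted by F1 score, with the
# generic placeholder names mapped back to the supplied feature names
for rule, perf in clf.rules_[:3]:
    print(rule, perf)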