Example #1
    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        """
        if isinstance(X, pd.DataFrame):
            X = X.values
        if isinstance(y, (pd.DataFrame, pd.Series)):
            y = y.values

        self.n_features_ = X.shape[1]
        self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
        self.feature_placeholders = list(self.feature_dict_.keys())
        self.feature_names = list(self.feature_dict_.values())

        extracted_rules = self._extract_rules(X, y)
        self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules(X, y, extracted_rules)
        self.rules_ = [
            replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_
        ]
        self.complexity = self._get_complexity()

        return self
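All of these fit methods share the same naming trick: raw columns are first given placeholder names, rules are extracted against the placeholders, and the readable feature names are substituted back at the end. Below is a minimal sketch of what such a mapping could look like; get_feature_dict_sketch and replace_feature_name_sketch are illustrative stand-ins, not the library's actual helpers.

import pandas as pd

def get_feature_dict_sketch(n_features, feature_names=None):
    # hypothetical helper: map placeholders X_0..X_{n-1} to readable names
    placeholders = [f"X_{i}" for i in range(n_features)]
    return dict(zip(placeholders, feature_names if feature_names is not None else placeholders))

def replace_feature_name_sketch(rule_str, feature_dict):
    # hypothetical helper: substitute readable names into a rule string;
    # longest placeholders first so X_1 does not clobber X_10
    for placeholder, name in sorted(feature_dict.items(), key=lambda kv: -len(kv[0])):
        rule_str = rule_str.replace(placeholder, name)
    return rule_str

X = pd.DataFrame({"age": [25, 40, 60], "income": [30, 50, 80]})
fd = get_feature_dict_sketch(X.shape[1], list(X.columns))
print(replace_feature_name_sketch("X_0 <= 45 and X_1 > 40", fd))  # age <= 45 and income > 40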
Example #2
    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        """
        X, y = check_X_y(X, y)
        self.n_features_in_ = X.shape[1]

        self.n_features_ = X.shape[1]
        self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
        self.feature_placeholders = list(self.feature_dict_.keys())
        self.feature_names = list(self.feature_dict_.values())

        extracted_rules = self._extract_rules(X, y)
        self.rules_without_feature_names_, self.coef, self.intercept = self._score_rules(
            X, y, extracted_rules)
        self.rules_ = [
            replace_feature_name(rule, self.feature_dict_)
            for rule in self.rules_without_feature_names_
        ]
        self.complexity_ = self._get_complexity()

        return self
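The second variant replaces the manual DataFrame handling with scikit-learn's check_X_y, which validates that X and y have matching lengths and returns plain NumPy arrays; n_features_in_ is the attribute name scikit-learn expects fitted estimators to expose. A quick illustration of what the validation does:

from sklearn.utils.validation import check_X_y

X = [[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]]
y = [0, 1, 0]
X_checked, y_checked = check_X_y(X, y)  # converts to ndarrays, checks consistent lengths
print(type(X_checked).__name__, X_checked.shape, y_checked.shape)  # ndarray (3, 2) (3,)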
Example #3
    def fit(self, X, y, feature_names=None, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for anomalies.

        feature_names : array-like of str, shape (n_features,), optional
            String names for each feature. If not provided, features are
            enumerated with placeholder names.

        sample_weight : array-like, shape (n_samples,), optional
            Array of weights that are assigned to individual samples, typically
            the amount in case of transactions data. Used to grow regression
            trees producing further rules to be tested.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]
        self.classes_ = unique_labels(y)

        self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
        self.feature_placeholders = list(self.feature_dict_.keys())
        self.feature_names = list(self.feature_dict_.values())

        n_train = y.shape[0]
        w = np.ones(n_train) / n_train
        self.estimators_ = []
        self.estimator_weights_ = []
        self.estimator_errors_ = []
        for _ in range(self.n_estimators):
            # Fit a classifier with the specific weights
            clf = self.estimator()
            clf.fit(X, y, sample_weight=w)  # uses w as the sampling weight!
            preds = clf.predict(X)

            # Indicator function
            miss = preds != y

            # Recode the miss indicator as +1/-1 for the weight update
            miss2 = np.ones(miss.size)
            miss2[~miss] = -1

            # Error
            err_m = np.dot(w, miss) / sum(w)
            if err_m < 1e-3:
                break

            # Alpha
            alpha_m = 0.5 * np.log((1 - err_m) / float(err_m))

            # New weights
            w = w * np.exp(alpha_m * miss2)

            self.estimators_.append(deepcopy(clf))
            self.estimator_weights_.append(alpha_m)
            self.estimator_errors_.append(err_m)

        rules = []

        for est, est_weight in zip(self.estimators_, self.estimator_weights_):
            if isinstance(est, DecisionTreeClassifier):
                est_rules_values = tree_to_rules(est, self.feature_placeholders, prediction_values=True)
                est_rules = [rv[0] for rv in est_rules_values]

                # BRS scores are difference between class 1 % and class 0 % in a node
                est_values = np.array([rv[1] for rv in est_rules_values])
                rule_scores = (est_values[:, 1] - est_values[:, 0]) / est_values.sum(axis=1)

                compos_score = est_weight * rule_scores
                rules += [Rule(r, args=[score]) for r, score in zip(est_rules, compos_score)]

            if isinstance(est, SlipperClassifier):
                # SLIPPER uses uniform confidence over in rule observations
                est_rule = dict_to_rule(est.rule, est.feature_dict)
                rules += [Rule(est_rule, args=[est_weight])]

        self.rules_without_feature_names_ = rules
        self.rules_ = [
            replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_
        ]
        self.complexity_ = self._get_complexity()
        return self
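The loop in this example is a classic AdaBoost-style reweighting: each round fits a weak learner with the current sample weights, measures its weighted error, converts it to a stage weight alpha_m, and up-weights the misclassified samples. Here is a self-contained sketch of that update using a depth-1 scikit-learn tree as the weak learner; the rule-extraction classes in the example above belong to the surrounding library and are not reproduced.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
n = len(y)
w = np.ones(n) / n                      # start from uniform sample weights
stage_weights = []

for _ in range(5):
    stump = DecisionTreeClassifier(max_depth=1)
    stump.fit(X, y, sample_weight=w)
    miss = stump.predict(X) != y

    err = np.dot(w, miss) / w.sum()     # weighted training error of this round
    if err < 1e-3:                      # (near-)perfect weak learner: stop early
        break
    alpha = 0.5 * np.log((1 - err) / err)   # stage weight, as in the loop above

    # up-weight misclassified samples, down-weight the rest
    w = w * np.exp(alpha * np.where(miss, 1.0, -1.0))
    stage_weights.append(alpha)

print(stage_weights)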
Example #4
    def fit(self,
            X,
            y,
            feature_names=None,
            sample_weight=None) -> 'SkopeRulesClassifier':
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for anomalies.

        feature_names : array-like of str, shape (n_features,), optional
            String names for each feature. If not provided, features are
            enumerated with placeholder names.

        sample_weight : array-like, shape (n_samples,), optional
            Array of weights that are assigned to individual samples, typically
            the amount in case of transactions data. Used to grow regression
            trees producing further rules to be tested.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """
        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]
        self.sample_weight = sample_weight
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        if n_classes < 2:
            raise ValueError(
                "This method needs samples of at least 2 classes in the data, but the data contains only one class: %r"
                % self.classes_[0])

        if not isinstance(self.max_depth_duplication,
                          int) and self.max_depth_duplication is not None:
            raise ValueError("max_depth_duplication should be an integer")

        if not set(self.classes_) == {0, 1}:
            warn(
                "Found labels %s. This method assumes target class to be labeled as 1 and normal data to be labeled as "
                "0. Any label different from 0 will be considered as being from the target class."
                % set(self.classes_))
            y = (y > 0)

        # ensure that max_samples is in [1, n_samples]:
        n_samples = X.shape[0]

        if isinstance(self.max_samples, six.string_types):
            raise ValueError(
                'max_samples (%s) is not supported. Valid choices are: "auto", int or float'
                % self.max_samples)

        elif isinstance(self.max_samples, INTEGER_TYPES):
            if self.max_samples > n_samples:
                warn(
                    "max_samples (%s) is greater than the total number of samples (%s). max_samples will be set "
                    "to n_samples for estimation." %
                    (self.max_samples, n_samples))
                max_samples = n_samples
            else:
                max_samples = self.max_samples
        else:  # float
            if not (0. < self.max_samples <= 1.):
                raise ValueError("max_samples must be in (0, 1], got %r" %
                                 self.max_samples)
            max_samples = int(self.max_samples * X.shape[0])
        self.max_samples_ = max_samples
        self._max_depths = self.max_depth if isinstance(
            self.max_depth, Iterable) else [self.max_depth]

        self.feature_names_, self.feature_dict_ = self._enum_features(
            X, feature_names)

        self.tree_generators = self._get_tree_ensemble()
        self._fit_tree_ensemble(X, y)

        extracted_rules = self._extract_rules()
        scored_rules = self._score_rules(X, y, extracted_rules)
        self.rules_ = self._prune_rules(scored_rules)

        self.rules_without_feature_names_ = self.rules_
        self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                       for rule, perf in self.rules_]
        return self
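Both SkopeRules-style variants normalize max_samples the same way: a string is rejected, an int is clipped to n_samples with a warning, and a float in (0, 1] is read as a fraction of the data. The same logic restated as a small standalone function; resolve_max_samples is an illustrative name, not part of the library.

import warnings

def resolve_max_samples(max_samples, n_samples):
    # illustrative restatement of the max_samples handling above
    if isinstance(max_samples, str):
        raise ValueError(f"max_samples ({max_samples!r}) is not supported; use an int or a float")
    if isinstance(max_samples, int):
        if max_samples > n_samples:
            warnings.warn(f"max_samples ({max_samples}) > n_samples ({n_samples}); using n_samples")
            return n_samples
        return max_samples
    if not 0.0 < max_samples <= 1.0:    # float: fraction of the data
        raise ValueError(f"max_samples must be in (0, 1], got {max_samples!r}")
    return int(max_samples * n_samples)

print(resolve_max_samples(0.5, 1000))   # 500
print(resolve_max_samples(2000, 1000))  # warns, then 1000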
Example #5
    def fit(self, X, y, sample_weight=None) -> 'SkopeRulesClassifier':
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for anomalies.

        sample_weight : array-like, shape (n_samples,), optional
            Array of weights that are assigned to individual samples, typically
            the amount in case of transactions data. Used to grow regression
            trees producing further rules to be tested.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        if n_classes < 2:
            raise ValueError(
                "This method needs samples of at least 2 classes in the data, but the data contains only one class: %r"
                % self.classes_[0])

        if not isinstance(self.max_depth_duplication,
                          int) and self.max_depth_duplication is not None:
            raise ValueError("max_depth_duplication should be an integer")

        if not set(self.classes_) == {0, 1}:
            warn(
                "Found labels %s. This method assumes target class to be labeled as 1 and normal data to be labeled as "
                "0. Any label different from 0 will be considered as being from the target class."
                % set(self.classes_))
            y = (y > 0)

        # ensure that max_samples is in [1, n_samples]:
        n_samples = X.shape[0]

        if isinstance(self.max_samples, six.string_types):
            raise ValueError(
                'max_samples (%s) is not supported. Valid choices are: "auto", int or float'
                % self.max_samples)

        elif isinstance(self.max_samples, INTEGER_TYPES):
            if self.max_samples > n_samples:
                warn(
                    "max_samples (%s) is greater than the total number of samples (%s). max_samples will be set "
                    "to n_samples for estimation." %
                    (self.max_samples, n_samples))
                max_samples = n_samples
            else:
                max_samples = self.max_samples
        else:  # float
            if not (0. < self.max_samples <= 1.):
                raise ValueError("max_samples must be in (0, 1], got %r" %
                                 self.max_samples)
            max_samples = int(self.max_samples * X.shape[0])
        self.max_samples_ = max_samples

        # default columns names :
        feature_names_ = [
            BASE_FEATURE_NAME + x for x in np.arange(X.shape[1]).astype(str)
        ]
        if self.feature_names is not None:
            self.feature_dict_ = {
                BASE_FEATURE_NAME + str(i): feat
                for i, feat in enumerate(self.feature_names)
            }
        else:
            self.feature_dict_ = {
                BASE_FEATURE_NAME + str(i): feat
                for i, feat in enumerate(feature_names_)
            }
        self.feature_names_ = feature_names_

        self._max_depths = self.max_depth \
            if isinstance(self.max_depth, Iterable) else [self.max_depth]

        # define regression target:
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            weights = sample_weight - sample_weight.min()
            contamination = float(sum(y)) / len(y)
            y_reg = (np.sqrt(weights) * 0.5 / contamination * (y > 0)
                     - np.sqrt(weights.mean()) * (y == 0))
            y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
        else:
            y_reg = y  # same as any other classification bagging

        clfs = self._get_tree_ensemble(classify=True)
        regs = self._get_tree_ensemble(classify=False)

        self._fit_tree_ensemble(clfs, X, y)
        self._fit_tree_ensemble(regs, X, y_reg)

        self.estimators_, self.estimators_samples_, self.estimators_features_ = [], [], []

        for ensemble in clfs + regs:
            self.estimators_ += ensemble.estimators_
            self.estimators_samples_ += ensemble.estimators_samples_
            self.estimators_features_ += ensemble.estimators_features_

        rules_ = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):

            rules_from_tree = tree_to_rules(
                estimator,
                np.array(self.feature_names_)[features])
            rules_ += self._add_OOB_scores_to_rules(X, y, rules_from_tree,
                                                    samples, features)

        self.rules_ = self._filter_rules(rules_)
        self.rules_ = sorted(self.rules_, key=lambda x: -self.f1_score(x))
        self.rules_without_feature_names_ = self.rules_

        # Replace generic feature names by real feature names
        self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                       for rule, perf in self.rules_]
        return self
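When sample_weight is supplied, this variant builds a smooth regression target from the binary labels: positive samples are pushed up in proportion to the square root of their min-shifted weight, scaled by the contamination rate, negative samples are pushed down by the square root of the mean weight, and a sigmoid maps the result into (0, 1). The same expression isolated as a small function; this is a sketch of the code above, not a canonical API.

import numpy as np

def weighted_regression_target(y, sample_weight):
    # sketch of the y_reg construction above
    y = np.asarray(y)
    weights = np.asarray(sample_weight, dtype=float)
    weights = weights - weights.min()          # shift so the smallest weight is zero
    contamination = y.sum() / len(y)           # fraction of positive (anomalous) samples
    y_reg = (np.sqrt(weights) * 0.5 / contamination * (y > 0)
             - np.sqrt(weights.mean()) * (y == 0))
    return 1.0 / (1.0 + np.exp(-y_reg))        # sigmoid squashes the target into (0, 1)

y = np.array([0, 0, 1, 1, 0])
amounts = np.array([10.0, 5.0, 100.0, 40.0, 7.0])
print(weighted_regression_target(y, amounts))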
Example #6
    def fit(self,
            X,
            y,
            feature_names: list = None,
            undiscretized_features=[],
            verbose=False):
        """Fit rule lists to data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data

        y : array_like, shape = [n_samples]
            Labels
            
        feature_names : array_like, shape = [n_features], optional (default: [])
            String labels for each feature.
            If empty and X is a DataFrame, column labels are used.
            If empty and X is not a DataFrame, then features are simply enumerated
            
        undiscretized_features : array_like, shape = [n_features], optional (default: [])
            String labels for each feature which is NOT to be discretized.
            If empty, all numeric features are discretized
            
        verbose : bool
            Currently doesn't do anything

        Returns
        -------
        self : returns an instance of self.
        """
        self.seed()

        if len(set(y)) != 2:
            raise Exception(
                "Only binary classification is supported at this time!")

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_in_ = X.shape[1]
        self.classes_ = unique_labels(y)

        self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
        self.feature_placeholders = np.array(list(self.feature_dict_.keys()))
        self.feature_names = np.array(list(self.feature_dict_.values()))

        itemsets, self.discretizer = extract_fpgrowth(
            X,
            y,
            feature_names=self.feature_placeholders,
            minsupport=self.minsupport,
            maxcardinality=self.maxcardinality,
            undiscretized_features=undiscretized_features,
            disc_strategy=self.disc_strategy,
            disc_kwargs=self.disc_kwargs,
            verbose=verbose)
        X_df_onehot = self.discretizer.transform(X)

        # Now form the data-vs.-lhs set
        # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
        for c in X_df_onehot.columns:
            X_df_onehot[c] = [
                c if x == 1 else '' for x in list(X_df_onehot[c])
            ]
        X = [{}] * (len(itemsets) + 1)  # placeholders; every entry is reassigned below
        X[0] = set(range(len(X_df_onehot)))  # the default rule satisfies all data
        for (j, lhs) in enumerate(itemsets):
            X[j + 1] = set([
                i for (i, xi) in enumerate(X_df_onehot.values)
                if set(lhs).issubset(xi)
            ])

        # now form lhs_len
        lhs_len = [0]
        for lhs in itemsets:
            lhs_len.append(len(lhs))
        nruleslen = Counter(lhs_len)
        lhs_len = np.array(lhs_len)
        itemsets_all = ['null']
        itemsets_all.extend(itemsets)

        Xtrain = X
        Ytrain = np.vstack((1 - np.array(y), y)).T.astype(int)
        self.itemsets = itemsets_all

        permsdic = defaultdict(
            default_permsdic)  # We will store here the MCMC results
        # Do MCMC
        res, Rhat = run_bdl_multichain_serial(self.max_iter,
                                              self.thinning,
                                              self.alpha,
                                              self.listlengthprior,
                                              self.listwidthprior,
                                              Xtrain,
                                              Ytrain,
                                              nruleslen,
                                              lhs_len,
                                              self.maxcardinality,
                                              permsdic,
                                              self.burnin,
                                              self.n_chains,
                                              [None] * self.n_chains,
                                              verbose=self.verbose,
                                              seed=self.random_state)

        # Merge the chains
        permsdic = merge_chains(res)

        # The point estimate, BRL-point
        self.d_star = get_point_estimate(
            permsdic,
            lhs_len,
            Xtrain,
            Ytrain,
            self.alpha,
            nruleslen,
            self.maxcardinality,
            self.listlengthprior,
            self.listwidthprior,
            verbose=self.verbose)  # get the point estimate

        if self.d_star:
            # Compute the rule consequent
            self.theta, self.ci_theta = get_rule_rhs(Xtrain, Ytrain,
                                                     self.d_star, self.alpha,
                                                     True)

        self.final_itemsets = np.array(self.itemsets,
                                       dtype=object)[self.d_star]
        rule_strs = itemsets_to_rules(self.final_itemsets)
        self.rules_without_feature_names_ = [Rule(r) for r in rule_strs]
        self.rules_ = [
            replace_feature_name(rule, self.feature_dict_)
            for rule in self.rules_without_feature_names_
        ]

        self.complexity_ = self._get_complexity()

        return self
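The "data-vs.-lhs" step above records, for each mined itemset, which rows satisfy it, with index 0 reserved for the default rule that covers every row; the MCMC over rule lists then works purely on these coverage sets. The same idea in isolation on toy one-hot data; extract_fpgrowth and the MCMC machinery belong to the surrounding library and are not reproduced here.

import pandas as pd

# toy one-hot matrix: rows are samples, columns are binary conditions
X_onehot = pd.DataFrame({
    "age<30":     [1, 0, 1, 0],
    "income>50k": [0, 1, 1, 0],
})
itemsets = [("age<30",), ("age<30", "income>50k")]

# tag each cell with its column name where the condition holds, '' otherwise
tags = X_onehot.copy()
for c in tags.columns:
    tags[c] = [c if v == 1 else "" for v in tags[c]]

coverage = [set(range(len(X_onehot)))]  # index 0: the default rule covers all rows
for lhs in itemsets:
    coverage.append({i for i, row in enumerate(tags.values) if set(lhs).issubset(row)})

print(coverage)  # [{0, 1, 2, 3}, {0, 2}, {2}]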