Example #1
def test_f1_score():
    # Rule args appear to hold (precision, recall, _)
    rule0 = Rule('a > 0', (0, 0, 0))
    rule1 = Rule('a > 0', (0.5, 0.5, 0))
    rule2 = Rule('a > 0', (0.5, 0, 0))

    assert f1_score(rule0) == 0
    assert f1_score(rule1) == 0.5
    assert f1_score(rule2) == 0
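For context, a minimal `f1_score` sketch consistent with these assertions, assuming the first two entries of a rule's `args` hold precision and recall; this helper is an assumption, not necessarily the library's implementation:

def f1_score(rule) -> float:
    precision, recall = rule.args[0], rule.args[1]  # assumed args layout
    if precision + recall == 0:
        return 0  # guard the all-zero case (rule0)
    return 2 * precision * recall / (precision + recall)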
Example #2
def split(rule: Rule) -> List[Rule]:
    # A single-term rule needs no splitting
    if len(rule.agg_dict) == 1:
        return [rule]
    # Otherwise, join each term back into a string and build one rule per term
    indv_rule_strs = [' '.join(term) for term in rule.terms]
    return [Rule(s) for s in indv_rule_strs]
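A hedged usage sketch, assuming `Rule` parses a conjunction into one term per condition:

rule = Rule('a > 0 and b <= 3')
print(split(rule))
# expected: two single-term rules, Rule('a > 0') and Rule('b <= 3')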
Example #3
    def _add_OOB_scores_to_rules(self, X, y, rules_from_tree, in_bag_samples,
                                 features):

        # Create boolean mask for OOB samples (in_bag_samples is a boolean mask)
        mask = ~in_bag_samples
        if sum(mask) == 0:
            warn(
                "OOB evaluation not possible: doing it in-bag. Performance evaluation is likely to be wrong"
                " (overfitting) and selected rules are likely to not perform well! Please use max_samples < 1."
            )
            # Fall back to evaluating on the in-bag samples instead
            mask = in_bag_samples

        # XXX todo: idem without dataframe
        X_oob = pandas.DataFrame(
            (X[mask, :])[:, features],
            columns=np.array(self.feature_names_)[features])

        if X_oob.shape[1] <= 1:  # otherwise pandas bug (cf. issue #16363)
            return []

        y_oob = y[mask]
        y_oob = np.array((y_oob != 0))

        # Add OOB performances to rules:
        rules_from_tree = [
            Rule(r, args=self._eval_rule_perf(r, X_oob, y_oob))
            for r in set(rules_from_tree)
        ]
        return rules_from_tree
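The per-rule scores come from `self._eval_rule_perf`, which is not shown here. A plausible sketch, assuming it returns a (precision, recall) pair by filtering the OOB frame with `DataFrame.query` (this mirrors skope-rules, but is an assumption for this codebase):

    def _eval_rule_perf(self, rule, X, y):
        # Rows of the (OOB) DataFrame matching the rule's query string
        detected_index = list(X.query(rule).index)
        if len(detected_index) <= 1:
            return (0, 0)
        y_detected = y[detected_index]
        true_pos = y_detected[y_detected > 0].sum()
        if true_pos == 0:
            return (0, 0)
        pos = y[y > 0].sum()
        # precision over detected rows, recall over all positives
        return y_detected.mean(), float(true_pos) / pos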
Example #4
def test_similarity_tree():
    # Test that rules are correctly split into similarity bags
    rules = [Rule("a <= 2 and b > 45 and c <= 3 and a > 4", args=(1, 1, 0)),
             Rule("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
             Rule("a > 2 and b > 45", (0.5, 0.3, 0)),
             Rule("a > 2 and b > 40", (0.5, 0.2, 0)),
             Rule("a <= 2 and b <= 45", (1, 1, 0)),
             Rule("a > 2 and c <= 3", (1, 1, 0)),
             Rule("b > 45", (1, 1, 0))]

    sk = SkopeRulesClassifier(max_depth_duplication=2)
    rulesets = find_similar_rulesets(rules, max_depth_duplication=2)
    # Assert some couples of rules are in the same bag
    idx_bags_rules = []
    for idx_rule, r in enumerate(rules):
        idx_bags_for_rule = []
        for idx_bag, bag in enumerate(rulesets):
            if r in bag:
                idx_bags_for_rule.append(idx_bag)
        idx_bags_rules.append(idx_bags_for_rule)

    assert idx_bags_rules[0] == idx_bags_rules[1]
    assert idx_bags_rules[0] != idx_bags_rules[2]
    # Assert the best rules are kept
    final_rules = deduplicate(rules, sk.max_depth_duplication)
    assert rules[0] in final_rules
    assert rules[2] in final_rules
    assert rules[3] not in final_rules
Example #5
def score_lasso(X, y, rules: List[str], alphas=None, cv=3,
                prediction_task='regression',
                max_rules=2000, random_state=None) -> Tuple[List[Rule], List[float], float]:
    if alphas is None:
        if prediction_task == 'regression':
            alphas = _alpha_grid(X, y)
        elif prediction_task == 'classification':
            alphas = [1 / alpha
                      for alpha in np.logspace(-4, 4, num=10, base=10)]

    coef_zero_threshold = 1e-6 / np.mean(np.abs(y))
    mse_cv_scores = []
    nonzero_rule_coefs_count = []
    kf = KFold(cv)
    
    # alphas are sorted from most regularized to least regularized
    for alpha in alphas:
        
        if prediction_task == 'regression':
            m = Lasso(alpha=alpha, random_state=random_state)
        else:
            m = LogisticRegression(penalty='l1', C=1/alpha, solver='liblinear',
                                   random_state=random_state)
        mse_cv = 0  # squared error is used as the CV score for both tasks
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            m.fit(X_train, y_train)
            mse_cv += np.mean((m.predict(X_test) - y_test) ** 2)
        
        m.fit(X, y)
        
        rule_count = np.sum(np.abs(m.coef_.flatten()) > coef_zero_threshold)
        if rule_count > max_rules:
            break
        nonzero_rule_coefs_count.append(rule_count)
        mse_cv_scores.append(mse_cv / cv)
    
    best_alpha = alphas[np.argmin(mse_cv_scores)]
    if prediction_task == 'regression':
        lscv = Lasso(alpha=best_alpha, random_state=random_state, max_iter=2000)
    else:
        lscv = LogisticRegression(penalty='l1', C=1/best_alpha, solver='liblinear',
                                  random_state=random_state, max_iter=200)
    lscv.fit(X, y)

    coef_ = lscv.coef_.flatten()
    coefs = list(coef_[:-len(rules)])
    support = np.sum(X[:, -len(rules):], axis=0) / X.shape[0]

    nonzero_rules = []
    for r, w, s in zip(rules, coef_[-len(rules):], support):
        if abs(w) > coef_zero_threshold:
            nonzero_rules.append(Rule(r, args=[w], support=s))
            coefs.append(w)
    
    return nonzero_rules, coefs, lscv.intercept_
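Note that the negative slicing (`coef_[-len(rules):]`, `X[:, -len(rules):]`) assumes the design matrix stacks the raw features first and one binary indicator column per rule last. A minimal sketch of that assumed layout (all names are illustrative):

import numpy as np

X_raw = np.random.rand(100, 2)                       # raw features
rule_indicators = np.random.randint(0, 2, (100, 3))  # one 0/1 column per rule
X = np.hstack([X_raw, rule_indicators])

# support of rule j = fraction of samples satisfying it
support = np.sum(X[:, -3:], axis=0) / X.shape[0]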
Example #6
def score_linear(X,
                 y,
                 rules: List[str],
                 penalty='l1',
                 prediction_task='regression',
                 max_rules=30,
                 alpha=None,
                 random_state=None) -> Tuple[List[Rule], List[float], float]:

    if alpha is not None and max_rules is None:
        final_alpha = alpha
    elif max_rules is not None and alpha is None:
        final_alpha = get_best_alpha_under_max_rules(
            X,
            y,
            rules,
            penalty=penalty,
            prediction_task=prediction_task,
            max_rules=max_rules,
            random_state=random_state)
    else:
        raise ValueError(
            "Exactly one of alpha and max_rules must be provided")

    if prediction_task == 'regression':
        lscv = Lasso(alpha=final_alpha,
                     random_state=random_state,
                     max_iter=2000)
    else:
        lscv = LogisticRegression(penalty=penalty,
                                  C=1 / final_alpha,
                                  solver='liblinear',
                                  random_state=random_state,
                                  max_iter=200)
    lscv.fit(X, y)

    coef_ = lscv.coef_.flatten()
    coefs = list(coef_[:coef_.shape[0] - len(rules)])
    support = np.sum(X[:, -len(rules):], axis=0) / X.shape[0]

    nonzero_rules = []
    coef_zero_threshold = 1e-6 / np.mean(np.abs(y))
    for r, w, s in zip(rules, coef_[-len(rules):], support):
        if abs(w) > coef_zero_threshold:
            nonzero_rules.append(Rule(r, args=[w], support=s))
            coefs.append(w)

    return nonzero_rules, coefs, lscv.intercept_
Example #7
def score_lasso(X,
                y,
                rules: List[str],
                alphas=None,
                cv=3,
                max_rules=2000,
                random_state=None) -> Tuple[List[Rule], List[float], float]:
    if alphas is None:
        alphas = _alpha_grid(X, y)

    coef_zero_threshold = 1e-6 / np.mean(np.abs(y))
    mse_cv_scores = []
    nonzero_rule_coefs_count = []
    kf = KFold(cv)
    for alpha in alphas:  # alphas are sorted from largest to smallest
        m = Lasso(alpha=alpha, random_state=random_state)
        mse_cv = 0
        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            m.fit(X_train, y_train)
            mse_cv += np.mean((m.predict(X_test) - y_test)**2)

        m.fit(X, y)

        rule_count = sum(np.abs(m.coef_) > coef_zero_threshold)
        if rule_count > max_rules:
            break
        nonzero_rule_coefs_count.append(rule_count)
        mse_cv_scores.append(mse_cv / cv)

    best_alpha = alphas[np.argmin(mse_cv_scores)]
    lscv = Lasso(alpha=best_alpha, random_state=random_state, max_iter=2000)
    lscv.fit(X, y)

    coefs = list(lscv.coef_[:-len(rules)])
    support = np.sum(X[:, -len(rules):], axis=0) / X.shape[0]

    nonzero_rules = []
    for r, w, s in zip(rules, lscv.coef_[-len(rules):], support):
        if abs(w) > coef_zero_threshold:
            nonzero_rules.append(Rule(r, args=[w], support=s))
            coefs.append(w)

    return nonzero_rules, coefs, lscv.intercept_
Example #8
def score_precision_recall(X,
                           y,
                           rules: List[List[str]],
                           samples: List[List[int]],
                           features: List[List[int]],
                           feature_names: List[str],
                           oob: bool = True) -> List[Rule]:

    scored_rules = []

    for curr_rules, curr_samples, curr_features in zip(rules, samples, features):

        # Create boolean mask for OOB samples
        mask = ~indices_to_mask(curr_samples, X.shape[0])
        if sum(mask) == 0:
            if oob:
                warn(
                    "OOB evaluation not possible: doing it in-bag. Performance evaluation is likely to be wrong"
                    " (overfitting) and selected rules are likely to not perform well! Please use max_samples < 1."
                )
            # Fall back to in-bag rows; the integer indices select rows directly below
            mask = curr_samples

        # XXX todo: idem without dataframe

        X_oob = pd.DataFrame(
            (X[mask, :])[:, curr_features],
            columns=np.array(feature_names)[curr_features]
        )

        if X_oob.shape[1] <= 1:  # otherwise pandas bug (cf. issue #16363)
            continue  # skip this tree instead of discarding rules already scored

        y_oob = y[mask]
        y_oob = np.array((y_oob != 0))

        # Add OOB performances to rules:
        scored_rules += [
            Rule(r, args=_eval_rule_perf(r, X_oob, y_oob))
            for r in set(curr_rules)
        ]

    return scored_rules
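The `indices_to_mask` helper converts in-bag sample indices into a boolean mask; it is roughly equivalent to the following sketch (scikit-learn ships a private helper of the same name):

import numpy as np

def indices_to_mask(indices, mask_length):
    # True at every position that appears in `indices`
    mask = np.zeros(mask_length, dtype=bool)
    mask[indices] = True
    return mask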
Example #9
def score_lasso(X, y, rules: List[str], Cs, cv,
                random_state) -> Tuple[List[Rule], LassoCV]:
    if Cs is None:
        n_alphas = 100
        alphas = None
    elif hasattr(Cs, "__len__"):
        n_alphas = None
        alphas = 1. / np.asarray(Cs)  # support plain lists as well as arrays
    else:
        n_alphas = Cs
        alphas = None
    lscv = LassoCV(n_alphas=n_alphas,
                   alphas=alphas,
                   cv=cv,
                   random_state=random_state)
    lscv.fit(X, y)

    rules = [
        Rule(r, args=[w]) for r, w in zip(rules, lscv.coef_[-len(rules):])
    ]
    return rules, lscv
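The `Cs` handling follows the scikit-learn convention where C is an inverse regularization strength, so the branches cover "not given", "explicit grid", and "number of grid points". A hedged, self-contained usage sketch with placeholder data:

import numpy as np

X = np.random.rand(50, 5)              # raw features plus rule indicator columns
y = np.random.rand(50)
rule_strs = ['f3 > 0.5', 'f4 <= 0.2']  # placeholder rule strings

# explicit grid of C values; converted internally to alphas = 1 / C
scored, lscv = score_lasso(X, y, rule_strs, Cs=np.logspace(-2, 2, 5),
                           cv=3, random_state=0)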
Example #10
def prune_mins(rules: List[Rule], precision_min: float, recall_min: float) -> List[Rule]:
    # Factorize rules before semantic tree filtering
    rules_ = [tuple(rule) for rule in rules]
    rules_dict = {}

    # keep only rules verifying precision_min and recall_min:
    for rule, score in rules_:
        if score[0] >= precision_min and score[1] >= recall_min:
            if rule in rules_dict:
                # running mean over duplicates: new = old + (x - old) / count
                c = rules_dict[rule][2] + 1
                b = rules_dict[rule][1] + 1. / c * (
                        score[1] - rules_dict[rule][1])
                a = rules_dict[rule][0] + 1. / c * (
                        score[0] - rules_dict[rule][0])

                rules_dict[rule] = (a, b, c)
            else:
                # first occurrence: (precision, recall, count)
                rules_dict[rule] = (score[0], score[1], 1)

    rule_tuple_list = sorted(rules_dict.items(), key=lambda x: (x[1][0], x[1][1]), reverse=True)
    return [Rule(rule, args=scores) for rule, scores in rule_tuple_list]
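The update above is the standard incremental mean, new_mean = old_mean + (x - old_mean) / count, applied independently to precision and recall. A tiny self-contained check (values are illustrative):

old_mean, count = 0.6, 1   # one duplicate seen so far, precision 0.6
x = 0.8                    # precision of the next duplicate
count += 1
new_mean = old_mean + (x - old_mean) / count
print(round(new_mean, 10)) # 0.7, i.e. the plain mean (0.6 + 0.8) / 2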
Example #11
    def fit(self, X, y, feature_names=None, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for anomalies.

        feature_names : list of str, optional
            Names for each feature (column) of X. If None, placeholder names
            are generated and used internally.

        sample_weight : array-like, shape (n_samples,), optional
            Array of weights that are assigned to individual samples, typically
            the amount in case of transactions data. Used to grow regression
            trees producing further rules to be tested.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]
        self.classes_ = unique_labels(y)

        self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
        self.feature_placeholders = list(self.feature_dict_.keys())
        self.feature_names = list(self.feature_dict_.values())

        n_train = y.shape[0]
        w = np.ones(n_train) / n_train
        self.estimators_ = []
        self.estimator_weights_ = []
        self.estimator_errors_ = []
        self.feature_names = feature_names
        for _ in range(self.n_estimators):
            # Fit a classifier with the specific weights
            clf = self.estimator()
            clf.fit(X, y, sample_weight=w)  # uses w as the sampling weight!
            preds = clf.predict(X)

            # Indicator vector of misclassified samples
            miss = preds != y

            # Map to +1 (miss) / -1 (hit) for the weight update
            miss2 = np.ones(miss.size)
            miss2[~miss] = -1

            # Error
            err_m = np.dot(w, miss) / sum(w)
            if err_m < 1e-3:
                return self

            # Alpha
            alpha_m = 0.5 * np.log((1 - err_m) / float(err_m))

            # New weights: w_i <- w_i * exp(alpha_m * (+1 if miss else -1))
            w = np.multiply(w, np.exp(alpha_m * miss2))

            self.estimators_.append(deepcopy(clf))
            self.estimator_weights_.append(alpha_m)
            self.estimator_errors_.append(err_m)

        rules = []

        for est, est_weight in zip(self.estimators_, self.estimator_weights_):
            if isinstance(est, DecisionTreeClassifier):
                est_rules_values = tree_to_rules(est, self.feature_placeholders, prediction_values=True)
                est_rules = [rv[0] for rv in est_rules_values]

                # BRS scores are the difference between class-1 and class-0 proportions in a node
                est_values = np.array([rv[1] for rv in est_rules_values])
                rule_scores = (est_values[:, 1] - est_values[:, 0]) / est_values.sum(axis=1)

                compos_score = est_weight * rule_scores
                rules += [Rule(r, args=[w]) for (r, w) in zip(est_rules, compos_score)]

            if isinstance(est, SlipperClassifier):
                # SLIPPER uses a uniform confidence over the observations covered by the rule
                est_rule = dict_to_rule(est.rule, est.feature_dict)
                rules += [Rule(est_rule, args=[est_weight])]

        self.rules_without_feature_names_ = rules
        self.rules_ = [
            replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_
        ]
        self.complexity_ = self._get_complexity()
        return self
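The loop above is the standard discrete AdaBoost update: alpha_m = 0.5 * ln((1 - err_m) / err_m), with sample weights multiplied by exp(±alpha_m). A tiny numeric illustration (values are made up):

import numpy as np

err_m = 0.25                                 # weighted error of this round
alpha_m = 0.5 * np.log((1 - err_m) / err_m)  # ≈ 0.549

w = np.array([0.25, 0.25, 0.25, 0.25])
miss2 = np.array([1, -1, -1, -1])            # first sample misclassified
w = w * np.exp(alpha_m * miss2)              # misclassified sample upweighted
print(w / w.sum())                           # ≈ [0.5, 0.167, 0.167, 0.167]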
Example #12
    def fit(self,
            X,
            y,
            feature_names: list = None,
            undiscretized_features=None,
            verbose=False):
        """Fit rule lists to data

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data

        y : array-like, shape = [n_samples]
            Labels

        feature_names : array-like, shape = [n_features], optional (default: [])
            String labels for each feature.
            If empty and X is a DataFrame, column labels are used.
            If empty and X is not a DataFrame, features are simply enumerated.

        undiscretized_features : array-like, shape = [n_features], optional (default: [])
            String labels for each feature which is NOT to be discretized.
            If empty, all numeric features are discretized.

        verbose : bool
            Passed through to the itemset extraction step

        Returns
        -------
        self : returns an instance of self.
        """
        self.seed()

        # avoid a shared mutable default argument
        if undiscretized_features is None:
            undiscretized_features = []

        if len(set(y)) != 2:
            raise ValueError(
                "Only binary classification is supported at this time!")

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_in_ = X.shape[1]
        self.classes_ = unique_labels(y)

        self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
        self.feature_placeholders = np.array(list(self.feature_dict_.keys()))
        self.feature_names = np.array(list(self.feature_dict_.values()))

        itemsets, self.discretizer = extract_fpgrowth(
            X,
            y,
            feature_names=self.feature_placeholders,
            minsupport=self.minsupport,
            maxcardinality=self.maxcardinality,
            undiscretized_features=undiscretized_features,
            disc_strategy=self.disc_strategy,
            disc_kwargs=self.disc_kwargs,
            verbose=verbose)
        X_df_onehot = self.discretizer.transform(X)

        # Now form the data-vs.-lhs set
        # X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
        for c in X_df_onehot.columns:
            X_df_onehot[c] = [
                c if x == 1 else '' for x in list(X_df_onehot[c])
            ]
        # X is rebound here: X[j] becomes the set of row indices satisfying itemset j
        X = [set()] * (len(itemsets) + 1)  # every slot is reassigned below
        X[0] = set(range(
            len(X_df_onehot)))  # the default rule satisfies all data
        for (j, lhs) in enumerate(itemsets):
            X[j + 1] = set([
                i for (i, xi) in enumerate(X_df_onehot.values)
                if set(lhs).issubset(xi)
            ])

        # now form lhs_len
        lhs_len = [0]
        for lhs in itemsets:
            lhs_len.append(len(lhs))
        nruleslen = Counter(lhs_len)
        lhs_len = np.array(lhs_len)
        itemsets_all = ['null']
        itemsets_all.extend(itemsets)

        Xtrain = X
        Ytrain = np.vstack((1 - np.array(y), y)).T.astype(int)
        self.itemsets = itemsets_all

        permsdic = defaultdict(
            default_permsdic)  # We will store here the MCMC results
        # Do MCMC
        res, Rhat = run_bdl_multichain_serial(self.max_iter,
                                              self.thinning,
                                              self.alpha,
                                              self.listlengthprior,
                                              self.listwidthprior,
                                              Xtrain,
                                              Ytrain,
                                              nruleslen,
                                              lhs_len,
                                              self.maxcardinality,
                                              permsdic,
                                              self.burnin,
                                              self.n_chains,
                                              [None] * self.n_chains,
                                              verbose=self.verbose,
                                              seed=self.random_state)

        # Merge the chains
        permsdic = merge_chains(res)

        # The point estimate, BRL-point
        self.d_star = get_point_estimate(
            permsdic,
            lhs_len,
            Xtrain,
            Ytrain,
            self.alpha,
            nruleslen,
            self.maxcardinality,
            self.listlengthprior,
            self.listwidthprior,
            verbose=self.verbose)  # get the point estimate

        if self.d_star:
            # Compute the rule consequent
            self.theta, self.ci_theta = get_rule_rhs(Xtrain, Ytrain,
                                                     self.d_star, self.alpha,
                                                     True)

        self.final_itemsets = np.array(self.itemsets,
                                       dtype=object)[self.d_star]
        rule_strs = itemsets_to_rules(self.final_itemsets)
        self.rules_without_feature_names_ = [Rule(r) for r in rule_strs]
        self.rules_ = [
            replace_feature_name(rule, self.feature_dict_)
            for rule in self.rules_without_feature_names_
        ]

        self.complexity_ = self._get_complexity()

        return self
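A hedged end-to-end usage sketch; the class name and import below are assumptions based on the attributes this fit method references (it resembles imodels' Bayesian rule list estimator):

import numpy as np
from imodels import BayesianRuleListClassifier  # assumed enclosing class

X = np.random.rand(100, 4)
y = (X[:, 0] > 0.5).astype(int)  # binary target, as fit requires

model = BayesianRuleListClassifier()
model.fit(X, y, feature_names=['f0', 'f1', 'f2', 'f3'])
print(model.rules_)              # mined rules with original feature names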