Example #1
def test_performance_not_deteriorate():
    '''Compare the model performance to baselines.

    It is unclear what to compare against, since performance
    varies widely across models (MSE, default settings):
    decision tree regressor: 6946
    random forest regressor: 2970
    linear model: 2820
    '''
    clf = SkopeRules(regression=True,
                     max_depth_duplication=None,
                     max_depth=X.shape[1] // 3,
                     n_estimators=850,
                     precision_min=0.0,
                     recall_min=0.0,
                     feature_names=feature_names)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # comparing to a baseline from linear regression:
    assert mse < 2820
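
The linear-regression baseline used as the threshold above can be sanity-checked with a short sketch (assuming the same module-level X and y; the exact MSE depends on the dataset):

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
baseline = LinearRegression().fit(X_train, y_train)
print(mean_squared_error(y_test, baseline.predict(X_test)))  # ~2820 on the original data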
Example #2
def test_performances():
    X, y = make_blobs(n_samples=1000, random_state=0, centers=2)

    # make labels imbalanced by removing all but 100 instances from class 1
    indexes = np.ones(X.shape[0]).astype(bool)
    ind = np.array([False] * 100 + list((y == 1)[100:]))
    indexes[ind] = 0
    X = X[indexes]
    y = y[indexes]
    n_samples, n_features = X.shape

    clf = SkopeRules()
    # fit
    clf.fit(X, y)
    # with lists
    clf.fit(X.tolist(), y.tolist())
    y_pred = clf.predict(X)
    assert_equal(y_pred.shape, (n_samples, ))
    # training set performance
    assert_greater(accuracy_score(y, y_pred), 0.83)

    # decision_function agrees with predict
    decision = -clf.decision_function(X)
    assert_equal(decision.shape, (n_samples, ))
    dec_pred = (decision.ravel() < 0).astype(int)  # np.int is deprecated; use builtin int
    assert_array_equal(dec_pred, y_pred)
Example #3
def test_can_predict():
    clf = SkopeRules(regression=True,
                     max_depth_duplication=2,
                     n_estimators=30,
                     precision_min=0.20,
                     recall_min=0.20,
                     feature_names=feature_names)
    clf.fit(X, y)
    clf.predict(X)
Example #4
def test_creates_rules():
    clf = SkopeRules(regression=True,
                     max_depth_duplication=2,
                     n_estimators=30,
                     precision_min=0.0,
                     recall_min=0.0,
                     feature_names=feature_names)
    clf.fit(X, y)
    rules = clf.rules_
    assert len(rules) > 0
Example #5
def KfoldAcc(X, y, multiclass=False, k=10):
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  #test set

        neigh = KNeighborsClassifier()
        neigh.fit(X_train, y_train)
        neigh_y_pred = neigh.predict(X_test)
        accuracy['neigh'].append(
            accuracy_score(y_test,
                           neigh_y_pred,
                           normalize=True,
                           sample_weight=None))

        print('---------')
        tree = DecisionTreeClassifier()
        tree.fit(X_train, y_train)
        tree_y_pred = tree.predict(X_test)
        accuracy['tree'].append(
            accuracy_score(y_test,
                           tree_y_pred,
                           normalize=True,
                           sample_weight=None))

        print('---------')
        naive = GaussianNB()
        naive.fit(X_train, y_train)
        naive_y_pred = naive.predict(X_test)
        accuracy['naive'].append(
            accuracy_score(y_test,
                           naive_y_pred,
                           normalize=True,
                           sample_weight=None))

        print('---------')
        rule = SkopeRules()
        if multiclass is True:
            rule = OneVsRestClassifier(rule)
        rule.fit(X_train, y_train)
        rules_y_pred = rule.predict(X_test)
        accuracy['rule'].append(
            accuracy_score(y_test,
                           rules_y_pred,
                           normalize=True,
                           sample_weight=None))

    return accuracy
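
A possible way to call KfoldAcc and summarize the per-fold accuracies (the synthetic data is illustrative, and the classifiers used inside the function are assumed to be imported at module level):

import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
accuracy = KfoldAcc(X, y, multiclass=False, k=10)
for name, scores in accuracy.items():
    print(name, round(float(np.mean(scores)), 3))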
Example #6
def test_deduplication_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [10, 5],
              [5, -7]]
    # Test SkopeRules with rule deduplication enabled
    clf = SkopeRules(random_state=rng, max_samples=1., max_depth_duplication=3)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    score_top_rules = clf.score_top_rules(X_test)
    pred = clf.predict(X_test)
    pred_score_top_rules = clf.predict_top_rules(X_test, 1)
Example #7
def test_skope_rules_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [10, 5],
              [5, -7]]
    # Fit SkopeRules on the toy data
    clf = SkopeRules(random_state=rng, max_samples=1.)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    separate_rules_score = clf.separate_rules_score(X_test)
    pred = clf.predict(X_test)
    # assert detect outliers:
    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
    assert_greater(np.min(rules_vote[-2:]), np.max(rules_vote[:-2]))
    assert_greater(np.min(separate_rules_score[-2:]),
                   np.max(separate_rules_score[:-2]))
    assert_array_equal(pred, 6 * [0] + 2 * [1])
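
The asserts above pin down the sign convention: decision_function scores the outlier class higher, which is why Example #2 negates it before thresholding at zero. A minimal sketch of ranking test samples by that score:

import numpy as np

scores = clf.decision_function(X_test)  # higher score = more anomalous, per the asserts
ranking = np.argsort(-scores)           # sample indices from most to least suspicious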
Example #8
    def _getSkopeRules(X_train, y_train, model_params):
        # Rules
        print("Obtaining Rules using SkopeRules...")
        clf = SkopeRules(**model_params)
        clf.fit(X_train, y_train)
        rules = clf.rules_

        if len(rules) > 0:
            print("Checking inliers inside hypercubes...")
            df_rules = pd.DataFrame({
                "rule": [v[0].replace(" and ", " & ") for v in rules],
                "precision": [v[1][0] for v in rules],
                "recall": [v[1][1] for v in rules],
                "n_points_correct": [v[1][2] for v in rules],
            })
            if not df_rules.empty:
                df_rules["size_rules"] = df_rules.apply(
                    lambda x: len(x["rule"].split("&")), axis=1)
            else:
                df_rules["size_rules"] = 0
            rules = [v[0].replace(" and ", " & ") for v in rules]

            # Obtain rules in df format
            if len(rules) > 0:
                print("Turning rules to hypercubes...")
                df_rules_results = turn_rules_to_df(list_rules=rules,
                                                    list_cols=feature_cols)

                df_rules_pruned = simplifyRules(df_rules_results,
                                                categorical_cols)
                df_rules_pruned = df_rules_pruned.reset_index().merge(
                    df_rules.reset_index()[["index", "size_rules"]],
                    how="left")
                df_rules_pruned.index = df_rules_pruned["index"]
                df_rules_pruned = df_rules_pruned.drop(columns=["index"],
                                                       errors="ignore")
                df_rules_results = df_rules_pruned
            else:
                df_rules_results = pd.DataFrame()
            return df_rules_results
Example #9
def main():
    mail = get_feat_scores()  # pandas DataFrame
    train, test = train_test_split(mail, test_size=0.3)  # split up data
    x_train = train.drop(columns=['label'])  # remove label from train x
    y_train = train.drop(columns=['message', 'sf', 'hf'])
    cv = CountVectorizer(input='content',
                         stop_words=stp.words('english'),
                         ngram_range=(1, 2))
    x_tr = cv.fit_transform(x_train.message)  # vectorize x_train text for the algorithm
    skr = SkopeRules(n_estimators=30, feature_names=['sf', 'hf'])  # algorithm
    y_train = y_train.to_numpy().ravel()  # turn y_train into a 1d array for the algorithm
    y_train = y_train.astype('int')
    skr.fit(x_tr.toarray(), y_train)
    #test data
    x_test = test.drop(columns=['label'])
    y_test = test.drop(columns=['message', 'sf', 'hf'])
    x_tst = cv.transform(x_test.message)
    y_test = y_test.to_numpy().ravel()
    y_test = y_test.astype('int')
    y_score = skr.score_top_rules(x_tst.toarray())
    #metrics
    recall_scr = recall_score(y_test, y_score, average='micro')
    f1_scr = f1_score(y_test, y_score, average='micro')
    pr_score = precision_score(y_test, y_score, average='micro')
    print("recall: " + str(recall_scr))
    print("f1: " + str(f1_scr))
    print("precision: " + str(pr_score))
    #plot
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall curve')
    plt.show()
Example #10
def OverSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]
    y = data_np[:, 18]
    sm = SMOTE()
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  #test set
        print('num of pos:', np.sum(y_train), ', num of neg:',
              y_train.size - np.sum(y_train))

        X_over, y_over = sm.fit_resample(X_train,
                                         y_train)  #oversampled train set
        print('oversample:', X_over.shape, y_over.shape)
        print('---------------------------------------')

        neigh = KNeighborsClassifier()
        neigh.fit(X_over, y_over)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]
        #neigh scorer
        recall['neigh'].append(
            recall_score(y_test, neigh_y_pred, labels=None, pos_label=1))
        accuracy['neigh'].append(
            accuracy_score(y_test,
                           neigh_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))

        print('---------')
        tree = DecisionTreeClassifier()
        tree.fit(X_over, y_over)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]
        #tree scorer
        recall['tree'].append(
            recall_score(y_test, tree_y_pred, labels=None, pos_label=1))
        accuracy['tree'].append(
            accuracy_score(y_test,
                           tree_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))

        print('---------')
        naive = GaussianNB()
        naive.fit(X_over, y_over)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]
        #naive scorer
        recall['naive'].append(
            recall_score(y_test, naive_y_pred, labels=None, pos_label=1))
        accuracy['naive'].append(
            accuracy_score(y_test,
                           naive_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))

        print('---------')
        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_over, y_over)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)
        recall['rule'].append(
            recall_score(y_test, rules_y_pred, labels=None, pos_label=1))
        accuracy['rule'].append(
            accuracy_score(y_test,
                           rules_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))

    return accuracy, recall, auc
Example #11
def BalanceSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]
    y = data_np[:, 18]
    rus_balance = RandomUnderSampler(
        sampling_strategy=0.20)  #truncate neg to 5*#pos
    sm_balance = SMOTE()  #then oversample pos
    kf = KFold(n_splits=k, shuffle=True)

    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  #test set
        print('num of pos:', np.sum(y_train), ', num of neg:',
              y_train.size - np.sum(y_train))
        X_bal, y_bal = rus_balance.fit_resample(X_train,
                                                y_train)  #BALANCED SAMPLE
        print('1.under:')
        print('num of pos:', np.sum(y_bal), ', num of neg:',
              y_bal.size - np.sum(y_bal))
        X_bal, y_bal = sm_balance.fit_resample(X_bal, y_bal)
        print('2.over:')
        print('num of pos:', np.sum(y_bal), ', num of neg:',
              y_bal.size - np.sum(y_bal))
        print('---------------------------------------')

        neigh = KNeighborsClassifier()
        neigh.fit(X_bal, y_bal)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]
        #nei scorer
        recall['neigh'].append(
            recall_score(y_test, neigh_y_pred, labels=None, pos_label=1))
        accuracy['neigh'].append(
            accuracy_score(y_test,
                           neigh_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))

        print('---------')
        tree = DecisionTreeClassifier()
        tree.fit(X_bal, y_bal)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]
        #tree scorer
        recall['tree'].append(
            recall_score(y_test, tree_y_pred, labels=None, pos_label=1))
        accuracy['tree'].append(
            accuracy_score(y_test,
                           tree_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))

        print('---------')
        naive = GaussianNB()
        naive.fit(X_bal, y_bal)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]
        #naive scorer
        recall['naive'].append(
            recall_score(y_test, naive_y_pred, labels=None, pos_label=1))
        accuracy['naive'].append(
            accuracy_score(y_test,
                           naive_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))

        print('---------')
        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_bal, y_bal)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)
        recall['rule'].append(
            recall_score(y_test, rules_y_pred, labels=None, pos_label=1))
        accuracy['rule'].append(
            accuracy_score(y_test,
                           rules_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))

    return accuracy, recall, auc
Example #12
from sklearn.datasets import load_iris
from skrules import SkopeRules
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

dataset = load_iris()
print(dataset)

feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

clf = SkopeRules(max_depth_duplication=2,
                 n_estimators=30,
                 precision_min=0.3,
                 recall_min=0.1,
                 feature_names=feature_names)

for idx, species in enumerate(dataset.target_names):
    X, y = dataset.data, dataset.target
    clf.fit(X, y == idx)
    rules = clf.rules_[0:3]
    print("Rules for iris", species)
    for rule in rules:
        print(rule)
    print()
    print(20*'=')
    print()
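
As Example #8 below unpacks it, each element of clf.rules_ appears to be a (rule_string, (precision, recall, n_points)) tuple, so the top rule for the last fitted species can be pretty-printed along these lines:

rule, (prec, rec, n_points) = clf.rules_[0]
print('{0}  (precision={1:.2f}, recall={2:.2f}, n={3})'.format(rule, prec, rec, n_points))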
Example #13
class DecisionListClassifier(ClassifierMixin, ExplainerMixin):
    """ Decision List Classifier

    Currently a slight variant of SkopeRules from skope-rules.
    https://github.com/scikit-learn-contrib/skope-rules

    """

    available_explanations = ["global", "local"]
    explainer_type = "model"

    def __init__(self, feature_names=None, feature_types=None, **kwargs):
        """ Initializes skope rules.

        Args:
            **kwargs: Keyword arguments to be passed to SkopeRules
                in skope-rules.
        """
        self.feature_names = feature_names
        self.feature_types = feature_types
        self.kwargs = kwargs

    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types
        )
        self.feature_index_ = [
            "feature_" + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i] for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        self.internal_rules_, self.rules_, self.prec_, self.recall_, self.feat_rule_map_ = self._extract_rules(
            self.sk_model_.rules_
        )

        self.global_selector = gen_global_selector(
            X, self.feature_names, self.feature_types, None
        )

        return self

    def predict(self, X):
        """ Predicts on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Predicted class label per instance.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types)
        scores = self.predict_proba(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def _scores(self, X):
        df = pd.DataFrame(X, columns=self.feature_index_)
        selected_rules = self.internal_rules_

        scores = np.ones(X.shape[0]) * np.inf
        for k, r in enumerate(selected_rules):
            matched_idx = list(df.query(r[0]).index)
            scores[matched_idx] = np.minimum(k, scores[matched_idx])
        scores[np.isinf(scores)] = len(selected_rules)
        scores = scores.astype("int64")

        return scores

    def predict_proba(self, X):
        """ Provides probability estimates on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Probability estimate of instance for each class.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types)
        scores = self._scores(X)
        prec_ar = np.array(self.prec_)
        return np.c_[1.0 - prec_ar[scores], prec_ar[scores]]

    def _extract_rules(self, rules):
        rules = deepcopy(rules)
        rules = list(sorted(rules, key=lambda x: x[1][0], reverse=True))

        rule_li = []
        prec_li = []
        recall_li = []
        predict_li = []
        features_dict = {feat: [] for feat in self.feature_names}

        def extract_orig_features(pattern, rule):
            feature_set = set()
            for m in re.finditer(pattern, rule):
                orig_feature = self.feature_map_[m.group(1)]
                feature_set.add(orig_feature)
            return feature_set

        for indx, rule_rec in enumerate(rules):
            rule = rule_rec[0]
            rule_round = " ".join(
                [
                    "{0:.2f}".format(float(x)) if x.replace(".", "", 1).isdigit() else x
                    for x in rule.split(" ")
                ]
            )
            pattern = r"(feature_[0-9]+)"
            feature_set = extract_orig_features(pattern, rule_round)
            rule_fix = re.sub(
                pattern, lambda m: self.feature_map_[m.group(1)], rule_round
            )
            rule_li.append(rule_fix)
            prec_li.append(rule_rec[1][0])
            recall_li.append(rule_rec[1][1])
            predict_li.append(1.0)

            for feat in feature_set:
                features_dict[feat].append(indx)

        # Provide default rule
        rule_li.append("No Rules Triggered")
        prec_li.append(self.pos_ratio_)
        recall_li.append(1.0)
        predict_li.append(0.0)

        return rules, rule_li, prec_li, recall_li, features_dict

    def explain_local(self, X, y=None, name=None):
        if name is None:
            name = gen_name_from_class(self)

        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        scores = self._scores(X)
        outcomes = self.predict(X)
        prob_scores = self.predict_proba(X)

        data_dicts = []
        for idx, score in enumerate(scores):
            data_dict = {
                "type": "rule",
                "rule": [self.rules_[score]],
                "precision": [self.prec_[score]],
                "recall": [self.recall_[score]],
                "outcome": [outcomes[idx]],
            }
            data_dicts.append(data_dict)

        internal_obj = {"overall": None, "specific": data_dicts}
        selector = gen_local_selector(X, y, prob_scores[:, 1])

        return RulesExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )

    def explain_global(self, name=None):
        """ Provides global explanation for model.

        Args:
            name: User-defined explanation name.

        Returns:
            An explanation object,
            visualizing feature-value pairs as horizontal bar chart.
        """
        if name is None:
            name = gen_name_from_class(self)

        # Extract rules
        rules, prec, recall, feat_rule_map = (
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        )

        outcomes = [self.classes_[1]] * (len(self.rules_) - 1)
        # Add the zero case for the default rule
        outcomes.append(self.classes_[0])
        overall_data_dict = {
            "type": "rule",
            "rule": rules,
            "precision": prec,
            "recall": recall,
            "outcome": outcomes,
        }
        data_dicts = [
            {
                "type": "rule",
                "rule": [rules[i] for i in feat_rule_map[feature]],
                "precision": [prec[i] for i in feat_rule_map[feature]],
                "recall": [recall[i] for i in feat_rule_map[feature]],
                "outcome": [outcomes[i] for i in feat_rule_map[feature]],
            }
            for feature in self.feature_names
        ]

        internal_obj = {"overall": overall_data_dict, "specific": data_dicts}

        return RulesExplanation(
            "global",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=self.global_selector,
        )
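
A minimal usage sketch for this wrapper, assuming it is importable as interpret.glassbox.DecisionListClassifier (the package the class above comes from):

from interpret.glassbox import DecisionListClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0)
dlc = DecisionListClassifier(feature_names=list(data.feature_names))
dlc.fit(X_train, y_train)
print(dlc.predict(X_test)[:10])     # predicted class labels
explanation = dlc.explain_global()  # RulesExplanation over the fitted rule list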
Example #14
class DecisionListClassifier(ClassifierMixin, ExplainerMixin):
    """ Decision List Classifier

    Currently a slight variant of SkopeRules from skope-rules.
    https://github.com/scikit-learn-contrib/skope-rules

    """
    available_explanations = ['global', 'local']
    explainer_type = 'model'

    def __init__(self, feature_names=None, feature_types=None, **kwargs):
        """ Initializes skope rules.

        Args:
            **kwargs: Keyword arguments to be passed to SkopeRules
                in skope-rules.
        """
        self.feature_names = feature_names
        self.feature_types = feature_types
        self.kwargs = kwargs

    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types)
        self.feature_index_ = [
            'feature_' + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i]
            for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        self.internal_rules_, self.rules_, self.prec_, self.recall_, self.feat_rule_map_ = \
            self._extract_rules(self.sk_model_.rules_)

        self.global_selector = gen_global_selector(X, self.feature_names,
                                                   self.feature_types, None)

        return self

    def predict(self, X):
        """ Predicts on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Predicted class label per instance.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self.predict_proba(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def _scores(self, X):
        df = pd.DataFrame(X, columns=self.feature_index_)
        selected_rules = self.internal_rules_

        scores = np.ones(X.shape[0]) * np.inf
        for k, r in enumerate(selected_rules):
            matched_idx = list(df.query(r[0]).index)
            scores[matched_idx] = np.minimum(k, scores[matched_idx])
        scores[np.isinf(scores)] = len(selected_rules)
        scores = scores.astype('int64')

        return scores

    def predict_proba(self, X):
        """ Provides probability estimates on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Probability estimate of instance for each class.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self._scores(X)
        prec_ar = np.array(self.prec_)
        return np.c_[1.0 - prec_ar[scores], prec_ar[scores]]

    def _extract_rules(self, rules):
        rules = deepcopy(rules)
        rules = list(sorted(rules, key=lambda x: x[1][0], reverse=True))

        rule_li = []
        prec_li = []
        recall_li = []
        predict_li = []
        features_dict = {feat: [] for feat in self.feature_names}

        def extract_orig_features(pattern, rule):
            feature_set = set()
            for m in re.finditer(pattern, rule):
                orig_feature = self.feature_map_[m.group(1)]
                feature_set.add(orig_feature)
            return feature_set

        for indx, rule_rec in enumerate(rules):
            rule = rule_rec[0]
            rule_round = ' '.join([
                '{0:.2f}'.format(float(x))
                if x.replace('.', '', 1).isdigit() else x
                for x in rule.split(' ')
            ])
            pattern = r'(feature_[0-9]+)'
            feature_set = extract_orig_features(pattern, rule_round)
            rule_fix = re.sub(pattern, lambda m: self.feature_map_[m.group(1)],
                              rule_round)
            rule_li.append(rule_fix)
            prec_li.append(rule_rec[1][0])
            recall_li.append(rule_rec[1][1])
            predict_li.append(1.0)

            for feat in feature_set:
                features_dict[feat].append(indx)

        # Provide default rule
        rule_li.append('No Rules Triggered')
        prec_li.append(self.pos_ratio_)
        recall_li.append(1.0)
        predict_li.append(0.0)

        return rules, rule_li, prec_li, recall_li, features_dict

    def explain_local(self, X, y=None, name=None):
        if name is None:
            name = gen_name_from_class(self)

        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        scores = self._scores(X)
        outcomes = self.predict(X)
        prob_scores = self.predict_proba(X)

        data_dicts = []
        for idx, score in enumerate(scores):
            data_dict = {
                'type': 'rule',
                'rule': [self.rules_[score]],
                'precision': [self.prec_[score]],
                'recall': [self.recall_[score]],
                'outcome': [outcomes[idx]],
            }
            data_dicts.append(data_dict)

        internal_obj = {
            'overall': None,
            'specific': data_dicts,
        }
        selector = gen_local_selector(X, y, prob_scores[:, 1])

        return RulesExplanation('local',
                                internal_obj,
                                feature_names=self.feature_names,
                                feature_types=self.feature_types,
                                name=name,
                                selector=selector)

    def explain_global(self, name=None):
        """ Provides global explanation for model.

        Args:
            name: User-defined explanation name.

        Returns:
            An explanation object,
            visualizing feature-value pairs as horizontal bar chart.
        """
        if name is None:
            name = gen_name_from_class(self)

        # Extract rules
        rules, prec, recall, feat_rule_map = \
            self.rules_, self.prec_, self.recall_, self.feat_rule_map_

        outcomes = [self.classes_[1]] * (len(self.rules_) - 1)
        # Add the zero case for the default rule
        outcomes.append(self.classes_[0])
        overall_data_dict = {
            'type': 'rule',
            'rule': rules,
            'precision': prec,
            'recall': recall,
            'outcome': outcomes
        }
        data_dicts = [{
            'type': 'rule',
            'rule': [rules[i] for i in feat_rule_map[feature]],
            'precision': [prec[i] for i in feat_rule_map[feature]],
            'recall': [recall[i] for i in feat_rule_map[feature]],
            'outcome': [outcomes[i] for i in feat_rule_map[feature]],
        } for feature in self.feature_names]

        internal_obj = {
            'overall': overall_data_dict,
            'specific': data_dicts,
        }

        return RulesExplanation('global',
                                internal_obj,
                                feature_names=self.feature_names,
                                feature_types=self.feature_types,
                                name=name,
                                selector=self.global_selector)
Example #15
    naive = GaussianNB()
    rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
    
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    accuracy_no['neigh'].append(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    accuracy_no['tree'].append(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    
    naive.fit(X_train, y_train)
    y_pred = naive.predict(X_test)
    accuracy_no['naive'].append(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    
    rule.fit(X_train, y_train)
    y_pred = rule.predict(X_test)
    accuracy_no['rule'].append(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    
    
#feature sel w/ boruta
for train_index, test_index in kf.split(X1):
    X_train, X_test = X1[train_index], X1[test_index]
    y_train, y_test = y_bal[train_index], y_bal[test_index]    #test set
    
    neigh = KNeighborsClassifier()  ##
    tree = DecisionTreeClassifier()
    naive = GaussianNB()
    rule = SkopeRules()
    
    neigh.fit(X_train, y_train)
Example #16
###############################################################################
# Getting rules with skrules
# ..................
#
# This part shows how SkopeRules can be fitted to detect credit defaults.
# Performances are compared with the random forest model previously trained.

# fit the model

clf = SkopeRules(
    similarity_thres=.9, max_depth=3, max_features=0.5,
    max_samples_features=0.5, random_state=rng, n_estimators=30,
    feature_names=feature_names, recall_min=0.02, precision_min=0.6
    )
clf.fit(X_train, y_train)

# in the separate_rules_score method, a score of k means that rule number k
# votes positively but rules 1, ..., k-1 do not. This lets us plot the
# performance of each rule separately on ROC and PR plots.
scoring = clf.separate_rules_score(X_test)

print(str(len(clf.rules_)) + ' rules have been built.')
print('The most precise rules are the following:')
print(clf.rules_[:5])

curves = [roc_curve, precision_recall_curve]
xlabels = ['False Positive Rate', 'Recall (True Positive Rate)']
ylabels = ['True Positive Rate (Recall)', 'Precision']

Example #17
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):

        orig_cols = list(X.names)

        import pandas as pd
        import numpy as np
        from skrules import SkopeRules
        from sklearn.preprocessing import OneHotEncoder
        from collections import Counter

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                            tmp_dir=self.context.tmp_dir,
                                            experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # Set up model
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            model = SkopeRules(max_depth_duplication=self.params["max_depth_duplication"],
                               n_estimators=self.params["n_estimators"],
                               precision_min=self.params["precision_min"],
                               recall_min=self.params["recall_min"],
                               max_samples=self.params["max_samples"],
                               max_samples_features=self.params["max_samples_features"],
                               max_depth=self.params["max_depth"],
                               max_features=self.params["max_features"],
                               min_samples_split=self.params["min_samples_split"],
                               bootstrap=self.params["bootstrap"],
                               bootstrap_features=self.params["bootstrap_features"],
                               random_state=self.params["random_state"],
                               feature_names=orig_cols)
        else:
            # Skopes doesn't work for regression
            loggerinfo(logger, "PASS, no skopes model")
            pass

        # Find the datatypes
        X = X.to_pandas()
        X.columns = orig_cols

        # Record the datatype of each column
        X_datatypes = [str(item) for item in list(X.dtypes)]

        # Change all float32 values to float64
        for ii in range(len(X_datatypes)):
            if X_datatypes[ii] == 'float32':
                X = X.astype({orig_cols[ii]: np.float64})

        X_datatypes = [str(item) for item in list(X.dtypes)]

        # List the categorical and numerical features
        self.X_categorical = [orig_cols[col_count] for col_count in range(len(orig_cols)) if
                              (X_datatypes[col_count] == 'category') or (X_datatypes[col_count] == 'object')]
        self.X_numeric = [item for item in orig_cols if item not in self.X_categorical]

        # Find the levels and mode for each categorical feature
        # for use in the test set
        self.train_levels = {}
        self.train_mode = {}
        for item in self.X_categorical:
            self.train_levels[item] = list(set(X[item]))
            self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

        # One hot encode the categorical features
        # and replace missing values with a Missing category
        if len(self.X_categorical) > 0:
            loggerinfo(logger, "PCategorical encode")

            for colname in self.X_categorical:
                X[colname] = list(X[colname].fillna("Missing"))
            self.enc = OneHotEncoder(handle_unknown='ignore')

            self.enc.fit(X[self.X_categorical])
            self.encoded_categories = list(self.enc.get_feature_names(input_features=self.X_categorical))

            X_enc = self.enc.transform(X[self.X_categorical]).toarray()

            X = pd.concat([X[self.X_numeric], pd.DataFrame(X_enc, columns=self.encoded_categories)], axis=1)

        # Replace missing values with a missing value code
        if len(self.X_numeric) > 0:

            for colname in self.X_numeric:
                X[colname] = list(X[colname].fillna(-999))

        model.fit(np.array(X), np.array(y))

        # Find the rule list
        self.rule_list = model.rules_

        # Calculate feature importances
        var_imp = []
        for var in orig_cols:
            var_imp.append(sum(int(var in item[0]) for item in self.rule_list))

        if max(var_imp) != 0:
            importances = list(np.array(var_imp) / max(var_imp))
        else:
            importances = [1] * len(var_imp)

        pd.DataFrame(model.rules_, columns=['Rule', '(Precision, Recall, nb)']).to_csv(
            os.path.join(tmp_folder, 'Skope_rules.csv'), index=False)

        self.mean_target = np.array(sum(y) / len(y))

        # Set model properties
        self.set_model_properties(model=model,
                                  features=list(X.columns),
                                  importances=importances,
                                  iterations=self.params['n_estimators'])
Example #18
    if dat in ('http', 'smtp'):
        y = (y != b'normal.').astype(int)
        print_outlier_ratio(y)

    n_samples, n_features = X.shape
    n_samples_train = n_samples // 2
    X = X.astype(float)
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    print('--- Fitting the SkopeRules estimator...')
    model = SkopeRules(n_estimators=5, max_depth=5, n_jobs=-1)
    tstart = time()
    model.fit(X_train, y_train)
    fit_time = time() - tstart
    tstart = time()

    scoring = -model.decision_function(X_test)  # the lower, the more abnormal

    print("--- Preparing the plot elements...")
    if with_decision_function_histograms:
        fig, ax = plt.subplots(3, sharex=True, sharey=True)
        bins = np.linspace(-0.5, 0.5, 200)
        ax[0].hist(scoring, bins, color='black')
        ax[0].set_title('Decision function for %s dataset' % dat)
        ax[1].hist(scoring[y_test == 0], bins, color='b', label='normal data')
        ax[1].legend(loc="lower right")
        ax[2].hist(scoring[y_test == 1], bins, color='r', label='outliers')
        ax[2].legend(loc="lower right")
Example #19
class DecisionListClassifier(ClassifierMixin,
                             ExplainerMixin):  # pragma: no cover
    """ Decision List Classifier

    Currently a slight variant of SkopeRules from skope-rules.
    https://github.com/scikit-learn-contrib/skope-rules

    """

    available_explanations = ["global", "local"]
    explainer_type = "model"

    def __init__(self, feature_names=None, feature_types=None, **kwargs):
        """ Initializes class.

        Args:
            feature_names: List of feature names.
            feature_types: List of feature types.
            **kwargs: Kwargs passed to wrapped SkopeRules at initialization time.
        """
        self.feature_names = feature_names
        self.feature_types = feature_types
        self.kwargs = kwargs

    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        from skrules import SkopeRules as SR

        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types)
        self.feature_index_ = [
            "feature_" + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i]
            for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        (
            self.internal_rules_,
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        ) = self._extract_rules(self.sk_model_.rules_)

        self.global_selector = gen_global_selector(X, self.feature_names,
                                                   self.feature_types, None)

        return self

    def predict(self, X):
        """ Predicts on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Predicted class label per instance.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self.predict_proba(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def _scores(self, X):
        df = pd.DataFrame(X, columns=self.feature_index_)
        selected_rules = self.internal_rules_

        scores = np.ones(X.shape[0]) * np.inf
        for k, r in enumerate(selected_rules):
            matched_idx = list(df.query(r[0]).index)
            scores[matched_idx] = np.minimum(k, scores[matched_idx])
        scores[np.isinf(scores)] = len(selected_rules)
        scores = scores.astype("int64")

        return scores

    def predict_proba(self, X):
        """ Provides probability estimates on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Probability estimate of instance for each class.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self._scores(X)
        prec_ar = np.array(self.prec_)
        return np.c_[1.0 - prec_ar[scores], prec_ar[scores]]

    def _extract_rules(self, rules):
        rules = deepcopy(rules)
        rules = list(sorted(rules, key=lambda x: x[1][0], reverse=True))

        rule_li = []
        prec_li = []
        recall_li = []
        predict_li = []
        features_dict = {feat: [] for feat in self.feature_names}

        def extract_orig_features(pattern, rule):
            feature_set = set()
            for m in re.finditer(pattern, rule):
                orig_feature = self.feature_map_[m.group(1)]
                feature_set.add(orig_feature)
            return feature_set

        for indx, rule_rec in enumerate(rules):
            rule = rule_rec[0]
            rule_round = " ".join([
                "{0:.2f}".format(float(x))
                if x.replace(".", "", 1).isdigit() else x
                for x in rule.split(" ")
            ])
            pattern = r"(feature_[0-9]+)"
            feature_set = extract_orig_features(pattern, rule_round)
            rule_fix = re.sub(pattern, lambda m: self.feature_map_[m.group(1)],
                              rule_round)
            rule_li.append(rule_fix)
            prec_li.append(rule_rec[1][0])
            recall_li.append(rule_rec[1][1])
            predict_li.append(1.0)

            for feat in feature_set:
                features_dict[feat].append(indx)

        # Provide default rule
        rule_li.append("No Rules Triggered")
        prec_li.append(self.pos_ratio_)
        recall_li.append(1.0)
        predict_li.append(0.0)

        return rules, rule_li, prec_li, recall_li, features_dict

    def explain_local(self, X, y=None, name=None):
        """ Provides local explanations for provided instances.

        Args:
            X: Numpy array for X to explain.
            y: Numpy vector for y to explain.
            name: User-defined explanation name.

        Returns:
            An explanation object.
        """
        if name is None:
            name = gen_name_from_class(self)

        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        scores = self._scores(X)
        outcomes = self.predict(X)
        prob_scores = self.predict_proba(X)

        data_dicts = []
        for idx, score in enumerate(scores):
            data_dict = {
                "type": "rule",
                "rule": [self.rules_[score]],
                "precision": [self.prec_[score]],
                "recall": [self.recall_[score]],
                "outcome": [outcomes[idx]],
            }
            data_dicts.append(data_dict)

        internal_obj = {"overall": None, "specific": data_dicts}
        selector = gen_local_selector(data_dicts, is_classification=True)

        return RulesExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )

    def explain_global(self, name=None):
        """ Provides global explanation for model.

        Args:
            name: User-defined explanation name.

        Returns:
            An explanation object.
        """
        if name is None:
            name = gen_name_from_class(self)

        # Extract rules
        rules, prec, recall, feat_rule_map = (
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        )

        outcomes = [self.classes_[1]] * (len(self.rules_) - 1)
        # Add the zero case for the default rule
        outcomes.append(self.classes_[0])
        overall_data_dict = {
            "type": "rule",
            "rule": rules,
            "precision": prec,
            "recall": recall,
            "outcome": outcomes,
        }
        data_dicts = [{
            "type": "rule",
            "rule": [rules[i] for i in feat_rule_map[feature]],
            "precision": [prec[i] for i in feat_rule_map[feature]],
            "recall": [recall[i] for i in feat_rule_map[feature]],
            "outcome": [outcomes[i] for i in feat_rule_map[feature]],
        } for feature in self.feature_names]

        internal_obj = {"overall": overall_data_dict, "specific": data_dicts}

        return RulesExplanation(
            "global",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=self.global_selector,
        )
Example #20
# Train the network, then make predictions on the test set and print the results
neural_network = compute_semi_supervised_learning(neural_network, X_train,
                                                  Y_train)
neural_network_pred = np.array(neural_network.predict_classes(
    np.array(X_test)))
print_statistics("Semi-Supervised Neural Network", neural_network_pred, Y_test)

# ************* Rule Model:  ************************
# Here we compare 3 skope-rules models on the validation set
# First skope rules model
rule_clf1 = SkopeRules(n_estimators=50,
                       precision_min=0.2,
                       recall_min=0.2,
                       feature_names=feature_names)
rule_clf1.fit(X_val, Y_val)
rule_clf1_ypred = rule_clf1.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 1", rule_clf1_ypred,
                 Y_test_val)
# Second skope rules model
rule_clf2 = SkopeRules(n_estimators=50,
                       precision_min=0.2,
                       recall_min=0.2,
                       feature_names=feature_names)
rule_clf2.fit(X_val, Y_val)
rule_clf2_ypred = rule_clf2.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 2", rule_clf2_ypred,
                 Y_test_val)
# Third skope rules model
rule_clf3 = SkopeRules(n_estimators=25,
                       precision_min=0.2,