Example #1
def test_performance_not_deteriorate():
    '''Compare the model performance to baselines.

    It is somewhat unclear what to compare against, since performance
    varies widely across models (MSE, with default settings):
    decision tree regressor: 6946
    random forest regressor: 2970
    linear model: 2820
    '''
    clf = SkopeRules(regression=True,
                     max_depth_duplication=None,
                     max_depth=X.shape[1] // 3,
                     n_estimators=850,
                     precision_min=0.,
                     recall_min=.0,
                     feature_names=feature_names)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    # comparing to a baseline from linear regression:
    assert mse < 2820
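The baseline figures quoted in the docstring above (decision tree ~6946, random forest ~2970, linear model ~2820) would presumably be reproduced by fitting vanilla scikit-learn regressors on the same split. A minimal sketch, assuming the same module-level X and y used by these tests:

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# Same split as in the test above; X and y are assumed to be the
# module-level dataset used throughout these regression tests.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=42)
baselines = {
    'decision tree': DecisionTreeRegressor(random_state=42),
    'random forest': RandomForestRegressor(random_state=42),
    'linear model': LinearRegression(),
}
for name, reg in baselines.items():
    reg.fit(X_train, y_train)
    print(name, mean_squared_error(y_test, reg.predict(X_test)))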
Example #2
def test_similarity_tree():
    # Test that rules are properly split into bags
    rules = [
        ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
        ("a <= 2 and b > 45 and c <= 3 and a > 4", (1, 1, 0)),
        ("a > 2 and b > 45", (0.5, 0.3, 0)),
        ("a > 2 and b > 40", (0.5, 0.2, 0)),
        ("a <= 2 and b <= 45", (1, 1, 0)),
        ("a > 2 and c <= 3", (1, 1, 0)),
        ("b > 45", (1, 1, 0)),
    ]

    sk = SkopeRules(max_depth_duplication=2)
    rulesets = sk._find_similar_rulesets(rules)
    # Assert some couples of rules are in the same bag
    idx_bags_rules = []
    for idx_rule, r in enumerate(rules):
        idx_bags_for_rule = []
        for idx_bag, bag in enumerate(rulesets):
            if r in bag:
                idx_bags_for_rule.append(idx_bag)
        idx_bags_rules.append(idx_bags_for_rule)

    assert_equal(idx_bags_rules[0], idx_bags_rules[1])
    assert_not_equal(idx_bags_rules[0], idx_bags_rules[2])
    # Assert the best rules are kept
    final_rules = sk.deduplicate(rules)
    assert_in(rules[0], final_rules)
    assert_in(rules[2], final_rules)
    assert_not_in(rules[3], final_rules)
Example #3
def test_skope_rules():
    """Check various parameter settings."""
    X_train = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3],
               [-4, -7]]
    y_train = [0] * 6 + [1] * 2
    X_test = np.array([[2, 1], [1, 1]])

    grid = ParameterGrid({
        "feature_names": [None, ['a', 'b']],
        "precision_min": [0.],
        "recall_min": [0.],
        "n_estimators": [1],
        "max_samples": [0.5, 4],
        "max_samples_features": [0.5, 2],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False],
        "max_depth": [2],
        "max_features": ["auto", 1, 0.1],
        "min_samples_split": [2, 0.1],
        "n_jobs": [-1, 2]
    })

    with ignore_warnings():
        for params in grid:
            SkopeRules(random_state=rng, **params).fit(X_train,
                                                       y_train).predict(X_test)

    # additional parameters:
    SkopeRules(n_estimators=50,
               max_samples=1.,
               recall_min=0.,
               precision_min=0.).fit(X_train, y_train).predict(X_test)
Example #4
def test_f1_score():
    clf = SkopeRules()
    rule0 = ('a > 0', (0, 0, 0))
    rule1 = ('a > 0', (0.5, 0.5, 0))
    rule2 = ('a > 0', (0.5, 0, 0))

    assert_equal(clf.f1_score(rule0), 0)
    assert_equal(clf.f1_score(rule1), 0.5)
    assert_equal(clf.f1_score(rule2), 0)
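The asserted values above follow from the harmonic-mean definition of F1 applied to the (precision, recall, _) tuple attached to each rule. A small standalone sketch of that computation (the exact implementation inside SkopeRules.f1_score is assumed here, not quoted):

def f1_from_rule(rule):
    # rule is (rule_string, (precision, recall, n)), as in the test above
    precision, recall = rule[1][0], rule[1][1]
    if precision + recall == 0:
        return 0
    return 2 * precision * recall / (precision + recall)

print(f1_from_rule(('a > 0', (0, 0, 0))))      # 0
print(f1_from_rule(('a > 0', (0.5, 0.5, 0))))  # 0.5
print(f1_from_rule(('a > 0', (0.5, 0, 0))))    # 0 (zero recall)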
Example #5
def test_creates_rules():
    clf = SkopeRules(regression=True,
                     max_depth_duplication=2,
                     n_estimators=30,
                     precision_min=0.0,
                     recall_min=0.0,
                     feature_names=feature_names)
    clf.fit(X, y)
    rules = clf.rules_
    assert len(rules) > 0
Example #6
def test_performances():
    X, y = make_blobs(n_samples=1000, random_state=0, centers=2)

    # make labels imbalanced by removing all but 100 instances from class 1
    indexes = np.ones(X.shape[0]).astype(bool)
    ind = np.array([False] * 100 + list(((y == 1)[100:])))
    indexes[ind] = 0
    X = X[indexes]
    y = y[indexes]
    n_samples, n_features = X.shape

    clf = SkopeRules()
    # fit
    clf.fit(X, y)
    # with lists
    clf.fit(X.tolist(), y.tolist())
    y_pred = clf.predict(X)
    assert_equal(y_pred.shape, (n_samples, ))
    # training set performance
    assert_greater(accuracy_score(y, y_pred), 0.83)

    # decision_function agrees with predict
    decision = -clf.decision_function(X)
    assert_equal(decision.shape, (n_samples, ))
    dec_pred = (decision.ravel() < 0).astype(int)
    assert_array_equal(dec_pred, y_pred)
Example #7
    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        from skrules import SkopeRules as SR

        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types)
        self.feature_index_ = [
            "feature_" + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i]
            for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        self.internal_rules_, self.rules_, self.prec_, self.recall_, self.feat_rule_map_ = self._extract_rules(
            self.sk_model_.rules_)

        self.global_selector = gen_global_selector(X, self.feature_names,
                                                   self.feature_types, None)

        return self
Example #8
    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types
        )
        self.feature_index_ = [
            "feature_" + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i] for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        self.internal_rules_, self.rules_, self.prec_, self.recall_, self.feat_rule_map_ = self._extract_rules(
            self.sk_model_.rules_
        )

        self.global_selector = gen_global_selector(
            X, self.feature_names, self.feature_types, None
        )

        return self
Example #9
def ML_exp(X, y_true, feature_names):

    from sklearn import tree
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from skrules import SkopeRules
    from tensorflow.keras import optimizers

    clfs = {}
    clfs['KNN'] = KNeighborsClassifier(n_neighbors=3)
    clfs['DT'] = tree.DecisionTreeClassifier()
    clfs['NB'] = GaussianNB()
    clfs['RB'] = SkopeRules(max_depth_duplication=None,
                            n_estimators=30,
                            precision_min=0.6,
                            recall_min=0.01,
                            feature_names=feature_names)

    mlp = getMLP(X.shape[-1], num_class=2)
    mlp.compile(loss='categorical_crossentropy',
                optimizer=optimizers.Adam(learning_rate=0.001),
                metrics=['accuracy'])
    clfs['MLP'] = mlp

    clfs['Voting'] = ensemble.get_VotingClassifier_ensemble_model(
        feature_names)
    boosting_clfs = ensemble.get_ada_boosting_clfs(feature_names)
    for key in boosting_clfs:
        clfs[key] = boosting_clfs[key]

    wrong_instances_clf, f1_records = cross_validation(clfs, X, y_true)
    return wrong_instances_clf, f1_records
Example #10
def test_max_samples_attribute():
    X = iris.data
    y = iris.target
    y = (y != 0)

    clf = SkopeRules(max_samples=1.).fit(X, y)
    assert_equal(clf.max_samples_, X.shape[0])

    clf = SkopeRules(max_samples=500)
    assert_warns_message(
        UserWarning, "max_samples will be set to n_samples for estimation",
        clf.fit, X, y)
    assert_equal(clf.max_samples_, X.shape[0])

    clf = SkopeRules(max_samples=0.4).fit(X, y)
    assert_equal(clf.max_samples_, 0.4 * X.shape[0])
Example #11
def get_VotingClassifier_ensemble_model(feature_names):
    from sklearn import ensemble
    from sklearn.ensemble import VotingClassifier
    from sklearn import tree
    from sklearn.naive_bayes import GaussianNB
    from sklearn.neighbors import KNeighborsClassifier
    from skrules import SkopeRules

    KNN = KNeighborsClassifier(n_neighbors=3)
    decision_tree = tree.DecisionTreeClassifier()
    NB = GaussianNB()
    RB = SkopeRules(max_depth_duplication=None,
                    n_estimators=30,
                    precision_min=0.6,
                    recall_min=0.01,
                    feature_names=feature_names)
    eclf1 = VotingClassifier(estimators=[('KNN', KNN), ('DT', decision_tree), ('NB', NB)], voting='hard')
    # model_1.fit(X_train,y_train)
    # model_2.fit(X_train,y_train)
    # model_3.fit(X_train,y_train)
    # model_4.fit(X_train,y_train)

    # pred1=model_1.predict(X_test)
    # pred2=model_2.predict(X_test)
    # pred3=model_3.predict(X_test)
    # pred4=model_4.predict(X_test)

    # final_pred = np.array([])
    # print("Ensemble model: Voting System")
    # for i in range(0,len(X_test)):
    #     final_pred = np.append(final_pred, mode([pred1[i], pred2[i], pred3[i],pred4[i]]))
    # print(final_pred)
    # return final_pred

    return eclf1
Example #12
def test_deduplication_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [10, 5],
              [5, -7]]
    # Test rule deduplication on the toy sample
    clf = SkopeRules(random_state=rng, max_samples=1., max_depth_duplication=3)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    score_top_rules = clf.score_top_rules(X_test)
    pred = clf.predict(X_test)
    pred_score_top_rules = clf.predict_top_rules(X_test, 1)
Example #13
    def _getSkopeRules(X_train, y_train, model_params):
        # Rules
        print("Obtaining Rules using SkopeRules...")
        clf = SkopeRules(**model_params)
        clf.fit(X_train, y_train)
        rules = clf.rules_

        if len(rules) > 0:
            print("Checking inliers inside hypercubes...")
            df_rules = pd.DataFrame({
                "rule": [v[0].replace(" and ", " & ") for v in rules],
                "precision": [v[1][0] for v in rules],
                "recall": [v[1][1] for v in rules],
                "n_points_correct": [v[1][2] for v in rules],
            })
            if not df_rules.empty:
                df_rules["size_rules"] = df_rules.apply(
                    lambda x: len(x["rule"].split("&")), axis=1)
            else:
                df_rules["size_rules"] = 0
            rules = [v[0].replace(" and ", " & ") for v in rules]

            # Obtain rules in df format
            if len(rules) > 0:
                print("Turning rules to hypercubes...")
                df_rules_results = turn_rules_to_df(list_rules=rules,
                                                    list_cols=feature_cols)

                df_rules_pruned = simplifyRules(df_rules_results,
                                                categorical_cols)
                df_rules_pruned = df_rules_pruned.reset_index().merge(
                    df_rules.reset_index()[["index", "size_rules"]],
                    how="left")
                df_rules_pruned.index = df_rules_pruned["index"]
                df_rules_pruned = df_rules_pruned.drop(columns=["index"],
                                                       errors="ignore")
                df_rules_results = df_rules_pruned
            else:
                df_rules_results = pd.DataFrame()
            return df_rules_results
Example #14
def test_can_predict():
    clf = SkopeRules(regression=True,
                     max_depth_duplication=2,
                     n_estimators=30,
                     precision_min=0.20,
                     recall_min=0.20,
                     feature_names=feature_names)
    clf.fit(X, y)
    clf.predict(X)
Example #15
def test_skope_rules_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [10, 5],
              [5, -7]]
    # Test SkopeRules on the toy sample
    clf = SkopeRules(random_state=rng, max_samples=1.)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    score_top_rules = clf.score_top_rules(X_test)
    pred = clf.predict(X_test)
    pred_score_top_rules = clf.predict_top_rules(X_test, 1)
    # assert detect outliers:
    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
    assert_greater(np.min(rules_vote[-2:]), np.max(rules_vote[:-2]))
    assert_greater(np.min(score_top_rules[-2:]), np.max(score_top_rules[:-2]))
    assert_array_equal(pred, 6 * [0] + 2 * [1])
    assert_array_equal(pred_score_top_rules, 6 * [0] + 2 * [1])
Example #16
def main():
    mail = get_feat_scores()  # pandas DataFrame
    train, test = train_test_split(mail, test_size=0.3)  # split up data
    x_train = train.drop(columns=['label'])  # remove the label column from the training x
    y_train = train.drop(columns=['message', 'sf', 'hf'])
    cv = CountVectorizer(input='content',
                         stop_words=stp.words('english'),
                         ngram_range=(1, 2))
    x_tr = cv.fit_transform(
        x_train.message)  #vectorize x_train text for algorithm
    skr = SkopeRules(n_estimators=30, feature_names=['sf', 'hf'])  #algorithm
    y_train = y_train.to_numpy().ravel()  # turn y_train into a 1d array for algorithm
    y_train = y_train.astype('int')
    skr.fit(x_tr.toarray(), y_train)
    # test data
    x_test = test.drop(columns=['label'])
    y_test = test.drop(columns=['message', 'sf', 'hf'])
    x_tst = cv.transform(x_test.message)
    y_test = y_test.to_numpy().ravel()
    y_test = y_test.astype('int')
    y_score = skr.score_top_rules(x_tst.toarray())
    #metrics
    recall_scr = recall_score(y_test, y_score, average='micro')
    f1_scr = f1_score(y_test, y_score, average='micro')
    pr_score = precision_score(y_test, y_score, average='micro')
    print("recall: " + str(recall_scr))
    print("f1: " + str(f1_scr))
    print("precision: " + str(pr_score))
    #plot
    precision, recall, r = precision_recall_curve(y_test, y_score)
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall curve')
    plt.show()
Example #17
def KfoldAcc(X, y, multiclass=False, k=10):
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  #test set

        neigh = KNeighborsClassifier()
        neigh.fit(X_train, y_train)
        neigh_y_pred = neigh.predict(X_test)
        accuracy['neigh'].append(
            accuracy_score(y_test,
                           neigh_y_pred,
                           normalize=True,
                           sample_weight=None))

        print('---------')
        tree = DecisionTreeClassifier()
        tree.fit(X_train, y_train)
        tree_y_pred = tree.predict(X_test)
        accuracy['tree'].append(
            accuracy_score(y_test,
                           tree_y_pred,
                           normalize=True,
                           sample_weight=None))

        print('---------')
        naive = GaussianNB()
        naive.fit(X_train, y_train)
        naive_y_pred = naive.predict(X_test)
        accuracy['naive'].append(
            accuracy_score(y_test,
                           naive_y_pred,
                           normalize=True,
                           sample_weight=None))

        print('---------')
        rule = SkopeRules()
        if multiclass is True:
            rule = OneVsRestClassifier(rule)
        rule.fit(X_train, y_train)
        rules_y_pred = rule.predict(X_test)
        accuracy['rule'].append(
            accuracy_score(y_test,
                           rules_y_pred,
                           normalize=True,
                           sample_weight=None))

    return accuracy
Example #18
    def extract_rules(bin_label: pd.Interval, features_df: pd.DataFrame,
                      class_encodings: pd.DataFrame,
                      objective_name: str) -> list:
        """
        Extract rules with given data and bin label.
        :param bin_label:
        :param features_df:
        :param class_encodings:
        :param objective_name:
        :return: List of extracted rules: (rule, precision, recall, support, result from, result to).
        """
        rules_clf: SkopeRules = SkopeRules(
            max_depth_duplication=None,
            n_estimators=30,
            precision_min=0.2,
            recall_min=0.01,
            feature_names=features_df.columns.values,
            n_jobs=1)
        rules_clf.fit(features_df.values,
                      class_encodings[objective_name] == bin_label)

        return [(rule[0], rule[1][0], rule[1][1], rule[1][2], bin_label.left,
                 bin_label.right) for rule in rules_clf.rules_]
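A hypothetical usage sketch for the helper above, assuming it is reachable as a plain (static) function and that class_encodings holds each sample's pd.Interval bin, produced with pd.cut, in a column named after the objective; all data and names below are illustrative, not from the original code:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
features_df = pd.DataFrame(rng.rand(300, 3), columns=['f1', 'f2', 'f3'])
objective = features_df['f1'] * 2 + rng.rand(300)

# Discretize the objective; each sample is labelled with a pd.Interval bin.
bins = pd.cut(objective, bins=4)
class_encodings = pd.DataFrame({'quality': bins})

for bin_label in bins.cat.categories:
    extracted = extract_rules(bin_label, features_df, class_encodings, 'quality')
    print(bin_label, len(extracted), 'rules')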
Example #19
def BalanceSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]
    y = data_np[:, 18]
    rus_balance = RandomUnderSampler(
        sampling_strategy=0.20)  #truncate neg to 5*#pos
    sm_balance = SMOTE()  #then oversample pos
    kf = KFold(n_splits=k, shuffle=True)

    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  #test set
        print('num of pos:', np.sum(y_train), ', num of neg:',
              y_train.size - np.sum(y_train))
        X_bal, y_bal = rus_balance.fit_resample(X_train,
                                                y_train)  #BALANCED SAMPLE
        print('1.under:')
        print('num of pos:', np.sum(y_bal), ', num of neg:',
              y_bal.size - np.sum(y_bal))
        X_bal, y_bal = sm_balance.fit_resample(X_bal, y_bal)
        print('2.over:')
        print('num of pos:', np.sum(y_bal), ', num of neg:',
              y_bal.size - np.sum(y_bal))
        print('---------------------------------------')

        neigh = KNeighborsClassifier()
        neigh.fit(X_bal, y_bal)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]
        #nei scorer
        recall['neigh'].append(
            recall_score(y_test, neigh_y_pred, labels=None, pos_label=1))
        accuracy['neigh'].append(
            accuracy_score(y_test,
                           neigh_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))

        print('---------')
        tree = DecisionTreeClassifier()
        tree.fit(X_bal, y_bal)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]
        # tree scorer
        recall['tree'].append(
            recall_score(y_test, tree_y_pred, labels=None, pos_label=1))
        accuracy['tree'].append(
            accuracy_score(y_test,
                           tree_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))

        print('---------')
        naive = GaussianNB()
        naive.fit(X_bal, y_bal)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]
        # naive Bayes scorer
        recall['naive'].append(
            recall_score(y_test, naive_y_pred, labels=None, pos_label=1))
        accuracy['naive'].append(
            accuracy_score(y_test,
                           naive_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))

        print('---------')
        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_bal, y_bal)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)
        recall['rule'].append(
            recall_score(y_test, rules_y_pred, labels=None, pos_label=1))
        accuracy['rule'].append(
            accuracy_score(y_test,
                           rules_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))

    return accuracy, recall, auc
Example #20
def OverSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]
    y = data_np[:, 18]
    sm = SMOTE()
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  #test set
        print('num of pos:', np.sum(y_train), ', num of neg:',
              y_train.size - np.sum(y_train))

        X_over, y_over = sm.fit_resample(X_train,
                                         y_train)  #oversampled train set
        print('oversample:', X_over.shape, y_over.shape)
        print('---------------------------------------')

        neigh = KNeighborsClassifier()
        neigh.fit(X_over, y_over)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]
        #nei scorer
        recall['neigh'].append(
            recall_score(y_test, neigh_y_pred, labels=None, pos_label=1))
        accuracy['neigh'].append(
            accuracy_score(y_test,
                           neigh_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))

        print('---------')
        tree = DecisionTreeClassifier()
        tree.fit(X_over, y_over)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]
        # tree scorer
        recall['tree'].append(
            recall_score(y_test, tree_y_pred, labels=None, pos_label=1))
        accuracy['tree'].append(
            accuracy_score(y_test,
                           tree_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))

        print('---------')
        naive = GaussianNB()
        naive.fit(X_over, y_over)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]
        # naive Bayes scorer
        recall['naive'].append(
            recall_score(y_test, naive_y_pred, labels=None, pos_label=1))
        accuracy['naive'].append(
            accuracy_score(y_test,
                           naive_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))

        print('---------')
        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_over, y_over)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)
        recall['rule'].append(
            recall_score(y_test, rules_y_pred, labels=None, pos_label=1))
        accuracy['rule'].append(
            accuracy_score(y_test,
                           rules_y_pred,
                           normalize=True,
                           sample_weight=None))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))

    return accuracy, recall, auc
Example #21
from sklearn.datasets import load_iris
from skrules import SkopeRules
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

dataset = load_iris()
print(dataset)

feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

clf = SkopeRules(max_depth_duplication=2,
                 n_estimators=30,
                 precision_min=0.3,
                 recall_min=0.1,
                 feature_names=feature_names)

for idx, species in enumerate(dataset.target_names):
    X, y = dataset.data, dataset.target
    clf.fit(X, y == idx)
    rules = clf.rules_[0:3]
    print("Rules for iris", species)
    for rule in rules:
        print(rule)
    print()
    print(20*'=')
    print()
Example #22
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):

        orig_cols = list(X.names)

        import pandas as pd
        import numpy as np
        from skrules import SkopeRules
        from sklearn.preprocessing import OneHotEncoder
        from collections import Counter

        # Get the logger if it exists
        logger = None
        if self.context and self.context.experiment_id:
            logger = make_experiment_logger(experiment_id=self.context.experiment_id,
                                            tmp_dir=self.context.tmp_dir,
                                            experiment_tmp_dir=self.context.experiment_tmp_dir)

        # Set up temp folder
        tmp_folder = self._create_tmp_folder(logger)

        # Set up model
        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)

            model = SkopeRules(max_depth_duplication=self.params["max_depth_duplication"],
                               n_estimators=self.params["n_estimators"],
                               precision_min=self.params["precision_min"],
                               recall_min=self.params["recall_min"],
                               max_samples=self.params["max_samples"],
                               max_samples_features=self.params["max_samples_features"],
                               max_depth=self.params["max_depth"],
                               max_features=self.params["max_features"],
                               min_samples_split=self.params["min_samples_split"],
                               bootstrap=self.params["bootstrap"],
                               bootstrap_features=self.params["bootstrap_features"],
                               random_state=self.params["random_state"],
                               feature_names=orig_cols)
        else:
            # Skopes doesn't work for regression
            loggerinfo(logger, "PASS, no skopes model")
            pass

        # Find the datatypes
        X = X.to_pandas()
        X.columns = orig_cols

        # Change continuous features to categorical
        X_datatypes = [str(item) for item in list(X.dtypes)]

        # Change all float32 values to float64
        for ii in range(len(X_datatypes)):
            if X_datatypes[ii] == 'float32':
                X = X.astype({orig_cols[ii]: np.float64})

        X_datatypes = [str(item) for item in list(X.dtypes)]

        # List the categorical and numerical features
        self.X_categorical = [orig_cols[col_count] for col_count in range(len(orig_cols)) if
                              (X_datatypes[col_count] == 'category') or (X_datatypes[col_count] == 'object')]
        self.X_numeric = [item for item in orig_cols if item not in self.X_categorical]

        # Find the levels and mode for each categorical feature
        # for use in the test set
        self.train_levels = {}
        self.train_mode = {}
        for item in self.X_categorical:
            self.train_levels[item] = list(set(X[item]))
            self.train_mode[item] = Counter(X[item]).most_common(1)[0][0]

        # One-hot encode the categorical features
        # and replace missing values with a "Missing" category
        if len(self.X_categorical) > 0:
            loggerinfo(logger, "Categorical encode")

            for colname in self.X_categorical:
                X[colname] = list(X[colname].fillna("Missing"))
            self.enc = OneHotEncoder(handle_unknown='ignore')

            self.enc.fit(X[self.X_categorical])
            self.encoded_categories = list(self.enc.get_feature_names(input_features=self.X_categorical))

            X_enc = self.enc.transform(X[self.X_categorical]).toarray()

            X = pd.concat([X[self.X_numeric], pd.DataFrame(X_enc, columns=self.encoded_categories)], axis=1)

        # Replace missing values with a missing value code
        if len(self.X_numeric) > 0:

            for colname in self.X_numeric:
                X[colname] = list(X[colname].fillna(-999))

        model.fit(np.array(X), np.array(y))

        # Find the rule list
        self.rule_list = model.rules_

        # Calculate feature importances
        var_imp = []
        for var in orig_cols:
            var_imp.append(sum(int(var in item[0]) for item in self.rule_list))

        if max(var_imp) != 0:
            importances = list(np.array(var_imp) / max(var_imp))
        else:
            importances = [1] * len(var_imp)

        pd.DataFrame(model.rules_, columns=['Rule', '(Precision, Recall, nb)']).to_csv(
            os.path.join(tmp_folder, 'Skope_rules.csv'), index=False)

        self.mean_target = np.array(sum(y) / len(y))

        # Set model properties
        self.set_model_properties(model=model,
                                  features=list(X.columns),
                                  importances=importances,
                                  iterations=self.params['n_estimators'])
Example #23
class DecisionListClassifier(ClassifierMixin, ExplainerMixin):
    """ Decision List Classifier

    Currently a slight variant of SkopeRules from skope-rules.
    https://github.com/scikit-learn-contrib/skope-rules

    """

    available_explanations = ["global", "local"]
    explainer_type = "model"

    def __init__(self, feature_names=None, feature_types=None, **kwargs):
        """ Initializes skope rules.

        Args:
            **kwargs: Keyword arguments to be passed to SkopeRules
                in skope-rules.
        """
        self.feature_names = feature_names
        self.feature_types = feature_types
        self.kwargs = kwargs

    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types
        )
        self.feature_index_ = [
            "feature_" + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i] for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        self.internal_rules_, self.rules_, self.prec_, self.recall_, self.feat_rule_map_ = self._extract_rules(
            self.sk_model_.rules_
        )

        self.global_selector = gen_global_selector(
            X, self.feature_names, self.feature_types, None
        )

        return self

    def predict(self, X):
        """ Predicts on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Predicted class label per instance.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types)
        scores = self.predict_proba(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def _scores(self, X):
        df = pd.DataFrame(X, columns=self.feature_index_)
        selected_rules = self.internal_rules_

        scores = np.ones(X.shape[0]) * np.inf
        for k, r in enumerate(selected_rules):
            matched_idx = list(df.query(r[0]).index)
            scores[matched_idx] = np.minimum(k, scores[matched_idx])
        scores[np.isinf(scores)] = len(selected_rules)
        scores = scores.astype("int64")

        return scores

    def predict_proba(self, X):
        """ Provides probability estimates on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Probability estimate of instance for each class.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names, self.feature_types)
        scores = self._scores(X)
        prec_ar = np.array(self.prec_)
        return np.c_[1.0 - prec_ar[scores], prec_ar[scores]]

    def _extract_rules(self, rules):
        rules = deepcopy(rules)
        rules = list(sorted(rules, key=lambda x: x[1][0], reverse=True))

        rule_li = []
        prec_li = []
        recall_li = []
        predict_li = []
        features_dict = {feat: [] for feat in self.feature_names}

        def extract_orig_features(pattern, rule):
            feature_set = set()
            for m in re.finditer(pattern, rule):
                orig_feature = self.feature_map_[m.group(1)]
                feature_set.add(orig_feature)
            return feature_set

        for indx, rule_rec in enumerate(rules):
            rule = rule_rec[0]
            rule_round = " ".join(
                [
                    "{0:.2f}".format(float(x)) if x.replace(".", "", 1).isdigit() else x
                    for x in rule.split(" ")
                ]
            )
            pattern = r"(feature_[0-9]+)"
            feature_set = extract_orig_features(pattern, rule_round)
            rule_fix = re.sub(
                pattern, lambda m: self.feature_map_[m.group(1)], rule_round
            )
            rule_li.append(rule_fix)
            prec_li.append(rule_rec[1][0])
            recall_li.append(rule_rec[1][1])
            predict_li.append(1.0)

            for feat in feature_set:
                features_dict[feat].append(indx)

        # Provide default rule
        rule_li.append("No Rules Triggered")
        prec_li.append(self.pos_ratio_)
        recall_li.append(1.0)
        predict_li.append(0.0)

        return rules, rule_li, prec_li, recall_li, features_dict

    def explain_local(self, X, y=None, name=None):
        if name is None:
            name = gen_name_from_class(self)

        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        scores = self._scores(X)
        outcomes = self.predict(X)
        prob_scores = self.predict_proba(X)

        data_dicts = []
        for idx, score in enumerate(scores):
            data_dict = {
                "type": "rule",
                "rule": [self.rules_[score]],
                "precision": [self.prec_[score]],
                "recall": [self.recall_[score]],
                "outcome": [outcomes[idx]],
            }
            data_dicts.append(data_dict)

        internal_obj = {"overall": None, "specific": data_dicts}
        selector = gen_local_selector(X, y, prob_scores[:, 1])

        return RulesExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )

    def explain_global(self, name=None):
        """ Provides global explanation for model.

        Args:
            name: User-defined explanation name.

        Returns:
            An explanation object,
            visualizing feature-value pairs as horizontal bar chart.
        """
        if name is None:
            name = gen_name_from_class(self)

        # Extract rules
        rules, prec, recall, feat_rule_map = (
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        )

        outcomes = [self.classes_[1]] * (len(self.rules_) - 1)
        # Add the zero case for the default rule
        outcomes.append(self.classes_[0])
        overall_data_dict = {
            "type": "rule",
            "rule": rules,
            "precision": prec,
            "recall": recall,
            "outcome": outcomes,
        }
        data_dicts = [
            {
                "type": "rule",
                "rule": [rules[i] for i in feat_rule_map[feature]],
                "precision": [prec[i] for i in feat_rule_map[feature]],
                "recall": [recall[i] for i in feat_rule_map[feature]],
                "outcome": [outcomes[i] for i in feat_rule_map[feature]],
            }
            for feature in self.feature_names
        ]

        internal_obj = {"overall": overall_data_dict, "specific": data_dicts}

        return RulesExplanation(
            "global",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=self.global_selector,
        )
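A minimal usage sketch for the class above, assuming it is packaged as interpret's glassbox DecisionListClassifier (the import path and dataset here are assumptions, not part of the original listing); the keyword arguments are forwarded to the wrapped SkopeRules estimator:

from interpret.glassbox import DecisionListClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                    random_state=0)

dlc = DecisionListClassifier(feature_names=list(data.feature_names),
                             n_estimators=30,
                             precision_min=0.5,
                             recall_min=0.01)
dlc.fit(X_train, y_train)
print(dlc.predict(X_test)[:10])   # predicted class labels
print(dlc.rules_[:3])             # most precise human-readable rules
explanation = dlc.explain_global(name='Decision list rules')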
Example #24
class DecisionListClassifier(ClassifierMixin, ExplainerMixin):
    """ Decision List Classifier

    Currently a slight variant of SkopeRules from skope-rules.
    https://github.com/scikit-learn-contrib/skope-rules

    """
    available_explanations = ['global', 'local']
    explainer_type = 'model'

    def __init__(self, feature_names=None, feature_types=None, **kwargs):
        """ Initializes skope rules.

        Args:
            **kwargs: Keyword arguments to be passed to SkopeRules
                in skope-rules.
        """
        self.feature_names = feature_names
        self.feature_types = feature_types
        self.kwargs = kwargs

    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types)
        self.feature_index_ = [
            'feature_' + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i]
            for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        self.internal_rules_, self.rules_, self.prec_, self.recall_, self.feat_rule_map_ = \
            self._extract_rules(self.sk_model_.rules_)

        self.global_selector = gen_global_selector(X, self.feature_names,
                                                   self.feature_types, None)

        return self

    def predict(self, X):
        """ Predicts on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Predicted class label per instance.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self.predict_proba(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def _scores(self, X):
        df = pd.DataFrame(X, columns=self.feature_index_)
        selected_rules = self.internal_rules_

        scores = np.ones(X.shape[0]) * np.inf
        for k, r in enumerate(selected_rules):
            matched_idx = list(df.query(r[0]).index)
            scores[matched_idx] = np.minimum(k, scores[matched_idx])
        scores[np.isinf(scores)] = len(selected_rules)
        scores = scores.astype('int64')

        return scores

    def predict_proba(self, X):
        """ Provides probability estimates on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Probability estimate of instance for each class.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self._scores(X)
        prec_ar = np.array(self.prec_)
        return np.c_[1.0 - prec_ar[scores], prec_ar[scores]]

    def _extract_rules(self, rules):
        rules = deepcopy(rules)
        rules = list(sorted(rules, key=lambda x: x[1][0], reverse=True))

        rule_li = []
        prec_li = []
        recall_li = []
        predict_li = []
        features_dict = {feat: [] for feat in self.feature_names}

        def extract_orig_features(pattern, rule):
            feature_set = set()
            for m in re.finditer(pattern, rule):
                orig_feature = self.feature_map_[m.group(1)]
                feature_set.add(orig_feature)
            return feature_set

        for indx, rule_rec in enumerate(rules):
            rule = rule_rec[0]
            rule_round = ' '.join([
                '{0:.2f}'.format(float(x))
                if x.replace('.', '', 1).isdigit() else x
                for x in rule.split(' ')
            ])
            pattern = r'(feature_[0-9]+)'
            feature_set = extract_orig_features(pattern, rule_round)
            rule_fix = re.sub(pattern, lambda m: self.feature_map_[m.group(1)],
                              rule_round)
            rule_li.append(rule_fix)
            prec_li.append(rule_rec[1][0])
            recall_li.append(rule_rec[1][1])
            predict_li.append(1.0)

            for feat in feature_set:
                features_dict[feat].append(indx)

        # Provide default rule
        rule_li.append('No Rules Triggered')
        prec_li.append(self.pos_ratio_)
        recall_li.append(1.0)
        predict_li.append(0.0)

        return rules, rule_li, prec_li, recall_li, features_dict

    def explain_local(self, X, y=None, name=None):
        if name is None:
            name = gen_name_from_class(self)

        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        scores = self._scores(X)
        outcomes = self.predict(X)
        prob_scores = self.predict_proba(X)

        data_dicts = []
        for idx, score in enumerate(scores):
            data_dict = {
                'type': 'rule',
                'rule': [self.rules_[score]],
                'precision': [self.prec_[score]],
                'recall': [self.recall_[score]],
                'outcome': [outcomes[idx]],
            }
            data_dicts.append(data_dict)

        internal_obj = {
            'overall': None,
            'specific': data_dicts,
        }
        selector = gen_local_selector(X, y, prob_scores[:, 1])

        return RulesExplanation('local',
                                internal_obj,
                                feature_names=self.feature_names,
                                feature_types=self.feature_types,
                                name=name,
                                selector=selector)

    def explain_global(self, name=None):
        """ Provides global explanation for model.

        Args:
            name: User-defined explanation name.

        Returns:
            An explanation object,
            visualizing feature-value pairs as horizontal bar chart.
        """
        if name is None:
            name = gen_name_from_class(self)

        # Extract rules
        rules, prec, recall, feat_rule_map = \
            self.rules_, self.prec_, self.recall_, self.feat_rule_map_

        outcomes = [self.classes_[1]] * (len(self.rules_) - 1)
        # Add the zero case for the default rule
        outcomes.append(self.classes_[0])
        overall_data_dict = {
            'type': 'rule',
            'rule': rules,
            'precision': prec,
            'recall': recall,
            'outcome': outcomes
        }
        data_dicts = [{
            'type': 'rule',
            'rule': [rules[i] for i in feat_rule_map[feature]],
            'precision': [prec[i] for i in feat_rule_map[feature]],
            'recall': [recall[i] for i in feat_rule_map[feature]],
            'outcome': [outcomes[i] for i in feat_rule_map[feature]],
        } for feature in self.feature_names]

        internal_obj = {
            'overall': overall_data_dict,
            'specific': data_dicts,
        }

        return RulesExplanation('global',
                                internal_obj,
                                feature_names=self.feature_names,
                                feature_types=self.feature_types,
                                name=name,
                                selector=self.global_selector)
Example #25
                       batch_size=1000)
    return neural_net


# Train the network, then make predictions on the test set and print the results
neural_network = compute_semi_supervised_learning(neural_network, X_train,
                                                  Y_train)
neural_network_pred = np.array(neural_network.predict_classes(
    np.array(X_test)))
print_statistics("Semi-Supervised Neural Network", neural_network_pred, Y_test)

# ************* Rule Model:  ************************
# Here we compare 3 skope-rules models on the validation set
# First skope rules model
rule_clf1 = SkopeRules(n_estimators=50,
                       precision_min=0.2,
                       recall_min=0.2,
                       feature_names=feature_names)
rule_clf1.fit(X_val, Y_val)
rule_clf1_ypred = rule_clf1.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 1", rule_clf1_ypred,
                 Y_test_val)
# Second skope rules model
rule_clf2 = SkopeRules(n_estimators=50,
                       precision_min=0.2,
                       recall_min=0.2,
                       feature_names=feature_names)
rule_clf2.fit(X_val, Y_val)
rule_clf2_ypred = rule_clf2.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 2", rule_clf2_ypred,
                 Y_test_val)
# Third skope rules model
Example #26
    def train(self, feature_names, symbol_vars):
        #model = xgboost.XGBClassifier(max_depth=7, n_estimators=10)
        #class_w=class_weight.compute_class_weight("balanced",np.unique(y),y)
        self.pddata['Y'] = (self.pddata['Y'] == self.args.label)
        self.pddata.to_csv(
            os.path.join(
                self.outdir, self.outname +
                datetime.datetime.now().strftime("%Y_%m_%d_%H_%M.csv")))
        traindata = self.pddata.sample(frac=0.8, replace=True)
        traindata = traindata.reset_index(drop=1)
        sample_weight = class_weight.compute_sample_weight(
            "balanced", traindata['Y'])
        X = traindata.iloc[:, 1:].to_numpy()
        y = traindata['Y']
        self.sample_weight = sample_weight
        data = xgboost.DMatrix(data=X,
                               label=y,
                               feature_names=feature_names,
                               feature_types=['int'] * X.shape[-1],
                               weight=sample_weight)
        self.feature_names = feature_names
        d = X.shape[-1]
        feature_combination = []
        for sym in symbol_vars:
            print(sym)
            if len(symbol_vars[sym]) > 0:
                feature_combination.append(symbol_vars[sym])
        import pickle
        if self.in_param_file:
            with open(self.in_param_file, 'rb') as f:  # pickle requires binary mode
                params = pickle.load(f)
                model = xgboost.train(
                    params=params,
                    dtrain=data,
                    num_boost_round=self.args.ntrees,
                )

        else:
            t_clf = self.tune()
            model = t_clf.best_estimator_._Booster
            params = t_clf.best_params_
            #params['rate_drop']=0.1
            #params['skip_drop']=0.5
            #params['normalize_type']='tree'
        with open(self.param_file, 'wb') as f:
            pickle.dump(params, f)
        print(self.linear)
        if self.args.debug:
            embed()
        model.dump_model(self.modelfile, with_stats=True)
        clf = SkopeRules(max_depth_duplication=self.args.depth,
                         precision_min=0.6,
                         recall_min=0.005,
                         verbose=1,
                         feature_names=feature_names)
        evaldata = self.pddata.sample(frac=0.3, replace=True)
        evaldata = evaldata.reset_index(drop=1)
        eval_sample_weight = class_weight.compute_sample_weight(
            "balanced", evaldata['Y'])
        clf.fit_xgbmodel(evaldata, model, eval_sample_weight)
        print("end fit_xgbmodel")
        clf.rules_.sort(key=lambda x: x[1], reverse=True)
        rules = {}
        for i in range(len(clf.rules_)):
            r = trim_rule(clf.rules_[i], evaldata, eval_sample_weight)
            rules[r[0]] = r[1]
        rulelist = []
        for r in rules:
            rulelist.append([r, rules[r]])
        rulelist.sort(key=lambda x: x[1], reverse=True)
        usedLinear = {}
        toLatex(rulelist, self.rule_latex)
        for lname in self.linear:
            if any(lname in r[0] for r in rulelist):
                usedLinear[lname] = self.linear[lname]
                print("%s=%s" % (lname, usedLinear[lname][0]))

        sym_vars = symbol_vars
        var_sizes = [
            len(sym_vars['c']),
            len(sym_vars['I']),
            len(sym_vars['Ialt']),
            len(sym_vars['s']),
            len(sym_vars['salt'])
        ]
        allr1, allr = simplify_rules(clf.rules_)
        #cnf=tocnffile(var_sizes,allr1,self.cnffile)
        allrscore = xgbtree_rule_perf(str(allr1), evaldata, evaldata['Y'],
                                      eval_sample_weight)
        print("all r=", simplify(~allr), allrscore)
        self.saverules(clf.rules_, [simplify(allr), allrscore], self.rulefile)
        if self.args.debug:
            embed()
Example #27
# a number of rules, each seeking high precision on a potentially small
# area of detection (low recall).


###############################################################################
# Getting rules with skrules
# ..................
#
# This part shows how SkopeRules can be fitted to detect credit defaults.
# Performances are compared with the random forest model previously trained.

# fit the model

clf = SkopeRules(
    similarity_thres=.9, max_depth=3, max_features=0.5,
    max_samples_features=0.5, random_state=rng, n_estimators=30,
    feature_names=feature_names, recall_min=0.02, precision_min=0.6
    )
clf.fit(X_train, y_train)

# in the separate_rules_score method, a score of k means that rule number k
# vote positively, but not rules 1, ..., k-1. It will allow us to plot
# performance of each rule separately on ROC and PR plots.
scoring = clf.separate_rules_score(X_test)

print(str(len(clf.rules_)) + ' rules have been built.')
print('The most precise rules are the following:')
print(clf.rules_[:5])

curves = [roc_curve, precision_recall_curve]
xlabels = ['False Positive Rate', 'Recall (True Positive Rate)']
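The listing above stops right after curves and xlabels are defined. Based on the comment about plotting each rule's performance on ROC and PR plots, a plausible continuation might look like the sketch below (an assumption, not the original script; scoring, y_test, roc_curve and precision_recall_curve are taken from the surrounding code):

import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, curve, xlabel in zip(axes, curves, xlabels):
    if curve is roc_curve:
        # roc_curve returns (fpr, tpr, thresholds)
        x_vals, y_vals, _ = curve(y_test, scoring)
        ylabel = 'True Positive Rate'
    else:
        # precision_recall_curve returns (precision, recall, thresholds)
        y_vals, x_vals, _ = curve(y_test, scoring)
        ylabel = 'Precision'
    ax.plot(x_vals, y_vals)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
plt.show()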
Example #28
# area of detection (low recall).

###############################################################################
# Getting rules with skrules
# ..........................
#
# This part shows how SkopeRules can be fitted to detect credit defaults.
# Performances are compared with the random forest model previously trained.

# fit the model

clf = SkopeRules(max_depth_duplication=3,
                 max_depth=3,
                 max_features=0.5,
                 max_samples_features=0.5,
                 random_state=rng,
                 n_estimators=20,
                 feature_names=feature_names,
                 recall_min=0.04,
                 precision_min=0.6)
clf.fit(X_train, y_train)

# in the score_top_rules method, a score of k means that rule number k
# vote positively, but not rules 1, ..., k-1. It will allow us to plot
# performance of each rule separately on the ROC and PR plots.
scoring = clf.score_top_rules(X_test)

print(str(len(clf.rules_)) + ' rules have been built.')
print('The 5 most precise rules are the following:')
for rule in clf.rules_[:5]:
    print(rule[0])
Example #29
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)
X1 = feat_selector.fit_transform(X_bal, y_bal)

from sklearn.feature_selection import SelectKBest, f_classif
X2 = SelectKBest(f_classif, k=20).fit_transform(X_bal, y_bal)


#no feature sel
for train_index, test_index in kf.split(X_bal):
    X_train, X_test = X_bal[train_index], X_bal[test_index]
    y_train, y_test = y_bal[train_index], y_bal[test_index]    #test set
    
    neigh = KNeighborsClassifier()  ##
    tree = DecisionTreeClassifier()
    naive = GaussianNB()
    rule = SkopeRules(feature_names= data.columns.to_list()[0:18])
    
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    accuracy_no['neigh'].append(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    accuracy_no['tree'].append(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    
    naive.fit(X_train, y_train)
    y_pred = naive.predict(X_test)
    accuracy_no['naive'].append(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    
    rule.fit(X_train, y_train)
    y_pred = rule.predict(X_test)
Example #30
def Sample2c(data, model):
    #use sampling method to rebalance data and train each model
    #model in ['SVC', 'tree', 'kNN', 'rule', 'RF', 'Adaboost']
    from sklearn.model_selection import KFold

    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler

    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import precision_recall_fscore_support

    from sklearn.svm import SVC
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from skrules import SkopeRules
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.multiclass import OneVsRestClassifier

    if model not in ['SVC', 'tree', 'kNN', 'rule', 'RF', 'Adaboost']:
        print('model not supported')
        return

    X, y = npData2c(data)
    acc_5c = []  #store average result(10 folds)
    confusion_mat_5c = []
    precision_5c = []
    recall_5c = []
    fscore_5c = []

    precision_all = []  # store raw per-fold scores for each class (10 folds per model)
    recall_all = []
    fscore_all = []

    kf = KFold(n_splits=10, shuffle=True)

    # balanced sampling: undersample class 0, oversample class 1
    a = 0
    p, r, f = np.zeros([2]), np.zeros([2]), np.zeros([2])
    c = np.zeros([2, 2])
    P = np.zeros([10, 2])
    R = np.zeros([10, 2])
    F = np.zeros([10, 2])

    rus_balance = RandomUnderSampler(sampling_strategy={0: 30000})
    sm_balance = SMOTE(sampling_strategy={1: 15000})
    i = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        X_train, y_train = rus_balance.fit_resample(X_train, y_train)
        X_train, y_train = sm_balance.fit_resample(X_train, y_train)

        if model == 'SVC':
            clf = SVC(kernel='rbf', gamma='scale')  ####################
        elif model == 'tree':
            clf = DecisionTreeClassifier()  ####################
        elif model == 'kNN':
            clf = KNeighborsClassifier()  ####################
        elif model == 'rule':
            rule = SkopeRules()
            clf = OneVsRestClassifier(rule)

        elif model == 'RF':
            clf = RandomForestClassifier(n_estimators=100,
                                         max_depth=2,
                                         random_state=0)
        elif model == 'Adaboost':
            clf = AdaBoostClassifier(n_estimators=100, random_state=0)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        a += accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
        c += confusion_matrix(y_test, y_pred)
        prf = precision_recall_fscore_support(y_test, y_pred)
        p += prf[0]
        r += prf[1]
        f += prf[2]
        P[i][0], P[i][1] = prf[0][0], prf[0][1]
        R[i][0], R[i][1] = prf[1][0], prf[1][1]
        F[i][0], F[i][1] = prf[2][0], prf[2][1]
        i += 1
        print(
            accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
        print(confusion_matrix(y_test, y_pred))
        print(precision_recall_fscore_support(y_test, y_pred))

    acc_5c.append(a / 10)
    confusion_mat_5c.append(c / 10)
    precision_5c.append(p / 10)
    recall_5c.append(r / 10)
    fscore_5c.append(f / 10)
    precision_all.append(P)
    recall_all.append(R)
    fscore_all.append(F)

    return (acc_5c, confusion_mat_5c, precision_5c, recall_5c, fscore_5c,
            [precision_all, recall_all, fscore_all])
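A hedged call sketch for Sample2c; it assumes data is the structure expected by npData2c and that the model string is one of the supported names:

acc, cm, prec, rec, f1, raw_scores = Sample2c(data, 'RF')
print('mean 10-fold accuracy:', acc[0])
print('mean confusion matrix:')
print(cm[0])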
Example #31
0
class DecisionListClassifier(ClassifierMixin,
                             ExplainerMixin):  # pragma: no cover
    """ Decision List Classifier

    Currently a slight variant of SkopeRules from skope-rules.
    https://github.com/scikit-learn-contrib/skope-rules

    """

    available_explanations = ["global", "local"]
    explainer_type = "model"

    def __init__(self, feature_names=None, feature_types=None, **kwargs):
        """ Initializes class.

        Args:
            feature_names: List of feature names.
            feature_types: List of feature types.
            **kwargs: Kwargs passed to wrapped SkopeRules at initialization time.
        """
        self.feature_names = feature_names
        self.feature_types = feature_types
        self.kwargs = kwargs

    def fit(self, X, y):
        """ Fits model to provided instances.

        Args:
            X: Numpy array for training instances.
            y: Numpy array as training labels.

        Returns:
            Itself.
        """
        from skrules import SkopeRules as SR

        X, y, self.feature_names, self.feature_types = unify_data(
            X, y, self.feature_names, self.feature_types)
        self.feature_index_ = [
            "feature_" + str(i) for i, v in enumerate(self.feature_names)
        ]
        self.feature_map_ = {
            v: self.feature_names[i]
            for i, v in enumerate(self.feature_index_)
        }
        self.sk_model_ = SR(feature_names=self.feature_index_, **self.kwargs)

        self.classes_, y = np.unique(y, return_inverse=True)
        self.sk_model_.fit(X, y)
        self.pos_ratio_ = np.mean(y)

        # Extract rules
        (
            self.internal_rules_,
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        ) = self._extract_rules(self.sk_model_.rules_)

        self.global_selector = gen_global_selector(X, self.feature_names,
                                                   self.feature_types, None)

        return self

    def predict(self, X):
        """ Predicts on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Predicted class label per instance.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self.predict_proba(X)
        return self.classes_[np.argmax(scores, axis=1)]

    def _scores(self, X):
        df = pd.DataFrame(X, columns=self.feature_index_)
        selected_rules = self.internal_rules_

        scores = np.ones(X.shape[0]) * np.inf
        for k, r in enumerate(selected_rules):
            matched_idx = list(df.query(r[0]).index)
            scores[matched_idx] = np.minimum(k, scores[matched_idx])
        scores[np.isinf(scores)] = len(selected_rules)
        scores = scores.astype("int64")

        return scores

    def predict_proba(self, X):
        """ Provides probability estimates on provided instances.

        Args:
            X: Numpy array for instances.

        Returns:
            Probability estimate of instance for each class.
        """

        X, _, _, _ = unify_data(X, None, self.feature_names,
                                self.feature_types)
        scores = self._scores(X)
        prec_ar = np.array(self.prec_)
        return np.c_[1.0 - prec_ar[scores], prec_ar[scores]]

    def _extract_rules(self, rules):
        rules = deepcopy(rules)
        rules = list(sorted(rules, key=lambda x: x[1][0], reverse=True))

        rule_li = []
        prec_li = []
        recall_li = []
        predict_li = []
        features_dict = {feat: [] for feat in self.feature_names}

        def extract_orig_features(pattern, rule):
            feature_set = set()
            for m in re.finditer(pattern, rule):
                orig_feature = self.feature_map_[m.group(1)]
                feature_set.add(orig_feature)
            return feature_set

        for indx, rule_rec in enumerate(rules):
            rule = rule_rec[0]
            rule_round = " ".join([
                "{0:.2f}".format(float(x))
                if x.replace(".", "", 1).isdigit() else x
                for x in rule.split(" ")
            ])
            pattern = r"(feature_[0-9]+)"
            feature_set = extract_orig_features(pattern, rule_round)
            rule_fix = re.sub(pattern, lambda m: self.feature_map_[m.group(1)],
                              rule_round)
            rule_li.append(rule_fix)
            prec_li.append(rule_rec[1][0])
            recall_li.append(rule_rec[1][1])
            predict_li.append(1.0)

            for feat in feature_set:
                features_dict[feat].append(indx)

        # Provide default rule
        rule_li.append("No Rules Triggered")
        prec_li.append(self.pos_ratio_)
        recall_li.append(1.0)
        predict_li.append(0.0)

        return rules, rule_li, prec_li, recall_li, features_dict

    def explain_local(self, X, y=None, name=None):
        """ Provides local explanations for provided instances.

        Args:
            X: Numpy array for X to explain.
            y: Numpy vector for y to explain.
            name: User-defined explanation name.

        Returns:
            An explanation object.
        """
        if name is None:
            name = gen_name_from_class(self)

        X, y, _, _ = unify_data(X, y, self.feature_names, self.feature_types)

        scores = self._scores(X)
        outcomes = self.predict(X)
        prob_scores = self.predict_proba(X)

        data_dicts = []
        for idx, score in enumerate(scores):
            data_dict = {
                "type": "rule",
                "rule": [self.rules_[score]],
                "precision": [self.prec_[score]],
                "recall": [self.recall_[score]],
                "outcome": [outcomes[idx]],
            }
            data_dicts.append(data_dict)

        internal_obj = {"overall": None, "specific": data_dicts}
        selector = gen_local_selector(data_dicts, is_classification=True)

        return RulesExplanation(
            "local",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=selector,
        )

    def explain_global(self, name=None):
        """ Provides global explanation for model.

        Args:
            name: User-defined explanation name.

        Returns:
            An explanation object.
        """
        if name is None:
            name = gen_name_from_class(self)

        # Extract rules
        rules, prec, recall, feat_rule_map = (
            self.rules_,
            self.prec_,
            self.recall_,
            self.feat_rule_map_,
        )

        outcomes = [self.classes_[1]] * (len(self.rules_) - 1)
        # Add the zero case for the default rule
        outcomes.append(self.classes_[0])
        overall_data_dict = {
            "type": "rule",
            "rule": rules,
            "precision": prec,
            "recall": recall,
            "outcome": outcomes,
        }
        data_dicts = [{
            "type": "rule",
            "rule": [rules[i] for i in feat_rule_map[feature]],
            "precision": [prec[i] for i in feat_rule_map[feature]],
            "recall": [recall[i] for i in feat_rule_map[feature]],
            "outcome": [outcomes[i] for i in feat_rule_map[feature]],
        } for feature in self.feature_names]

        internal_obj = {"overall": overall_data_dict, "specific": data_dicts}

        return RulesExplanation(
            "global",
            internal_obj,
            feature_names=self.feature_names,
            feature_types=self.feature_types,
            name=name,
            selector=self.global_selector,
        )
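A minimal usage sketch for the wrapper above, assuming a numeric binary-classification split (X_train, y_train, X_test) and that the interpret utilities it imports are available; precision_min and recall_min are illustrative kwargs forwarded to the wrapped SkopeRules:

dlc = DecisionListClassifier(precision_min=0.3, recall_min=0.1)
dlc.fit(X_train, y_train)
labels = dlc.predict(X_test)
probabilities = dlc.predict_proba(X_test)
rules_explanation = dlc.explain_global(name='decision list rules')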
Example #32
0
def Sample5c(X, y, model):
    # use sampling methods to rebalance the data and train the selected model
    # model must be one of ['SVC', 'tree', 'kNN', 'rule', 'RF', 'Adaboost']

    if model not in ['SVC', 'tree', 'kNN', 'rule', 'RF', 'Adaboost']:
        print('model not supported')
        return

    acc_5c = []  # store results averaged over the 10 folds
    confusion_mat_5c = []
    precision_5c = []
    recall_5c = []
    fscore_5c = []

    precision_all = []  # store raw per-fold scores for classes 4 and 5 (10 folds per model)
    recall_all = []
    fscore_all = []

    kf = KFold(n_splits=10, shuffle=True)

    # only downsample the majority classes (1, 2, 3, 4)
    a = 0
    p, r, f = np.zeros([5]), np.zeros([5]), np.zeros([5])
    c = np.zeros([5, 5])
    P = np.zeros([10, 2])
    R = np.zeros([10, 2])
    F = np.zeros([10, 2])
    rus_balance = RandomUnderSampler(sampling_strategy={
        1: 2500,
        2: 2500,
        3: 2500,
        4: 2500
    })

    i = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        X_train, y_train = rus_balance.fit_resample(X_train, y_train)

        if model == 'SVC':
            clf = SVC(kernel='rbf', gamma='scale')  ####################
        elif model == 'tree':
            clf = DecisionTreeClassifier()  ####################
        elif model == 'kNN':
            clf = KNeighborsClassifier()  ####################
        elif model == 'rule':
            rule = SkopeRules()
            clf = OneVsRestClassifier(rule)

        elif model == 'RF':
            clf = RandomForestClassifier(n_estimators=100,
                                         max_depth=2,
                                         random_state=0)
        elif model == 'Adaboost':
            clf = AdaBoostClassifier(n_estimators=100, random_state=0)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        a += accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
        c += confusion_matrix(y_test, y_pred)
        prf = precision_recall_fscore_support(y_test, y_pred)
        p += prf[0]
        r += prf[1]
        f += prf[2]
        P[i][0], P[i][1] = prf[0][3], prf[0][4]
        R[i][0], R[i][1] = prf[1][3], prf[1][4]
        F[i][0], F[i][1] = prf[2][3], prf[2][4]
        i += 1
        print(
            accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
        print(confusion_matrix(y_test, y_pred))
        print(precision_recall_fscore_support(y_test, y_pred))

    acc_5c.append(a / 10)
    confusion_mat_5c.append(c / 10)
    precision_5c.append(p / 10)
    recall_5c.append(r / 10)
    fscore_5c.append(f / 10)
    precision_all.append(P)
    recall_all.append(R)
    fscore_all.append(F)
    ##################

    # balanced sampling: downsample classes 1-3, oversample classes 4 and 5 with SMOTE
    a = 0
    p, r, f = np.zeros([5]), np.zeros([5]), np.zeros([5])
    c = np.zeros([5, 5])
    P = np.zeros([10, 2])
    R = np.zeros([10, 2])
    F = np.zeros([10, 2])

    rus_balance = RandomUnderSampler(sampling_strategy={
        1: 10000,
        2: 10000,
        3: 10000
    })
    sm_balance = SMOTE(sampling_strategy={4: 10000, 5: 5000})
    i = 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        X_train, y_train = rus_balance.fit_resample(X_train, y_train)
        X_train, y_train = sm_balance.fit_resample(X_train, y_train)

        if model == 'SVC':
            clf = SVC(kernel='rbf', gamma='scale')  ####################
        elif model == 'tree':
            clf = DecisionTreeClassifier()  ####################
        elif model == 'kNN':
            clf = KNeighborsClassifier()  ####################
        elif model == 'rule':
            rule = SkopeRules()
            clf = OneVsRestClassifier(rule)

        elif model == 'RF':
            clf = RandomForestClassifier(n_estimators=100,
                                         max_depth=2,
                                         random_state=0)
        elif model == 'Adaboost':
            clf = AdaBoostClassifier(n_estimators=100, random_state=0)

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        a += accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
        c += confusion_matrix(y_test, y_pred)
        prf = precision_recall_fscore_support(y_test, y_pred)
        p += prf[0]
        r += prf[1]
        f += prf[2]
        P[i][0], P[i][1] = prf[0][3], prf[0][4]
        R[i][0], R[i][1] = prf[1][3], prf[1][4]
        F[i][0], F[i][1] = prf[2][3], prf[2][4]
        i += 1
        print(
            accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
        print(confusion_matrix(y_test, y_pred))
        print(precision_recall_fscore_support(y_test, y_pred))

    acc_5c.append(a / 10)
    confusion_mat_5c.append(c / 10)
    precision_5c.append(p / 10)
    recall_5c.append(r / 10)
    fscore_5c.append(f / 10)
    precision_all.append(P)
    recall_all.append(R)
    fscore_all.append(F)

    return (acc_5c, confusion_mat_5c, precision_5c, recall_5c, fscore_5c,
            [precision_all, recall_all, fscore_all])
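A similar call sketch for the five-class variant, assuming X and y are NumPy arrays with labels 1-5; the first entry of each returned list comes from the undersampling-only run, the second from undersampling plus SMOTE:

acc, cm, prec, rec, f1, raw_scores = Sample5c(X, y, 'Adaboost')
print(acc)  # [mean accuracy, undersampling only; mean accuracy, undersampling + SMOTE]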