Beispiel #1
0
def test_deduplication_works():
    """Smoke-test SkopeRules configured with ``max_depth_duplication=3``.

    The estimator is fitted on a tiny toy set and every scoring/prediction
    method is called once on the test points. The returned values are not
    checked; the test only verifies that nothing raises.
    """
    # Toy sample: six inliers followed by two outliers.
    inliers = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    X = inliers + [[6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = inliers + [[10, 5], [5, -7]]

    model = SkopeRules(max_depth_duplication=3, max_samples=1.,
                       random_state=rng)
    model.fit(X, y)

    # Exercise the single-argument methods in the same order as before.
    for method_name in ('decision_function', 'rules_vote',
                        'score_top_rules', 'predict'):
        getattr(model, method_name)(X_test)
    model.predict_top_rules(X_test, 1)
Beispiel #2
0
def test_skope_rules_works():
    """Check that SkopeRules separates the two toy outliers from the inliers.

    Every score-like output must be strictly larger on the last two test
    points than on any of the first six, and both prediction methods must
    label exactly those last two points as 1.
    """
    # Toy sample: six inliers followed by two outliers.
    inliers = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
    X = inliers + [[6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = inliers + [[10, 5], [5, -7]]
    expected_labels = 6 * [0] + 2 * [1]

    model = SkopeRules(random_state=rng, max_samples=1.)
    model.fit(X, y)

    # Collect every output first, then assert on them.
    score_outputs = [
        model.decision_function(X_test),
        model.rules_vote(X_test),
        model.score_top_rules(X_test),
    ]
    labels = model.predict(X_test)
    labels_top_rules = model.predict_top_rules(X_test, 1)

    # Outliers (last two rows) must outrank all inliers for each score.
    for scores in score_outputs:
        assert_greater(np.min(scores[-2:]), np.max(scores[:-2]))

    assert_array_equal(labels, expected_labels)
    assert_array_equal(labels_top_rules, expected_labels)
Beispiel #3
0
def _messages_and_labels(frame):
    """Split a mail table into its message column and a 1-D int label array.

    The labels are whatever remains after dropping the 'message', 'sf' and
    'hf' columns, flattened and cast to int.
    """
    labels = frame.drop(columns=['message', 'sf', 'hf']).to_numpy().ravel()
    return frame.message, labels.astype('int')


def main():
    """Train SkopeRules on vectorized message text and report test metrics.

    The table from ``get_feat_scores()`` is split 70/30. The vectorizer and
    the rule model are fitted on the training part only; recall, F1 and
    precision are printed and a precision-recall curve is plotted for the
    held-out part.
    """
    mail = get_feat_scores()  # pandas table
    train, test = train_test_split(mail, test_size=0.3)  # split up data

    cv = CountVectorizer(input='content',
                         stop_words=stp.words('english'),
                         ngram_range=(1, 2))

    # Training data: fit the vocabulary and the rule model.
    train_messages, y_train = _messages_and_labels(train)
    x_tr = cv.fit_transform(train_messages)
    # NOTE(review): only two feature names are given while the model is
    # fitted on the full bag-of-words matrix -- confirm this is intended.
    skr = SkopeRules(n_estimators=30, feature_names=['sf', 'hf'])
    skr.fit(x_tr.toarray(), y_train)

    # Test data: must come from the held-out split, not from `train`,
    # otherwise the metrics below are measured on data the model has seen.
    test_messages, y_test = _messages_and_labels(test)
    x_tst = cv.transform(test_messages).toarray()

    # Label-based metrics need predicted labels; score_top_rules returns a
    # rule-rank score, which is only suitable for the ranking curve below.
    y_pred = skr.predict(x_tst)
    y_score = skr.score_top_rules(x_tst)

    # Metrics.
    recall_scr = recall_score(y_test, y_pred, average='micro')
    f1_scr = f1_score(y_test, y_pred, average='micro')
    pr_score = precision_score(y_test, y_pred, average='micro')
    print("recall: " + str(recall_scr))
    print("f1: " + str(f1_scr))
    print("precision: " + str(pr_score))

    # Plot.
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall curve')
    plt.show()
Beispiel #4
0
# Hyper-parameters of the rule learner, gathered in one place.
skope_params = dict(
    n_estimators=20,
    max_depth=3,
    max_depth_duplication=3,
    max_features=0.5,
    max_samples_features=0.5,
    precision_min=0.6,
    recall_min=0.04,
    feature_names=feature_names,
    random_state=rng,
)
clf = SkopeRules(**skope_params)
clf.fit(X_train, y_train)

# score_top_rules assigns score k when rule number k votes positively while
# rules 1, ..., k-1 do not. This lets each rule's performance be plotted
# separately on the ROC and PR plots.
scoring = clf.score_top_rules(X_test)

print('{} rules have been built.'.format(len(clf.rules_)))
print('The 5 most precise rules are the following:')
for rule in clf.rules_[:5]:
    print(rule[0])

# Curve functions with their matching axis labels (ROC first, then PR).
curves = [roc_curve, precision_recall_curve]
xlabels = ['False Positive Rate', 'Recall (True Positive Rate)']
ylabels = ['True Positive Rate (Recall)', 'Precision']

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5),
                         sharex=True, sharey=True)

# Left panel: ROC points for the rule model and for the `scoring_rf` scores.
ax = axes[0]
fpr, tpr, _ = roc_curve(y_test, scoring)
fpr_rf, tpr_rf, _ = roc_curve(y_test, scoring_rf)
from sklearn.datasets import load_boston
from sklearn.metrics import precision_recall_curve
from matplotlib import pyplot as plt
from skrules import SkopeRules
import seaborn as sns

# Fit SkopeRules on the first half of the Boston data and show the
# precision-recall trade-off of its rules on the second half.
# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases -- confirm the pinned version still provides it.
dataset = load_boston()
clf = SkopeRules(max_depth_duplication=None,
                 n_estimators=30,
                 precision_min=0.2,
                 recall_min=0.01,
                 feature_names=dataset.feature_names)

# Binary target: True where the regression target exceeds 25.
X, y = dataset.data, dataset.target > 25

# Plain first-half / second-half split (no shuffling).
half = len(y) // 2
X_train, y_train = X[:half], y[:half]
X_test, y_test = X[half:], y[half:]

clf.fit(X_train, y_train)
y_score = clf.score_top_rules(X_test)
precision, recall, _ = precision_recall_curve(y_test, y_score)
print(recall)
print("Precision", precision)

plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall curve')
plt.show()

# Pass x/y by keyword: newer seaborn releases no longer accept them
# positionally, and the keyword form also works on older releases.
ax = sns.barplot(x=recall, y=precision)
ax.set(xlabel='Recall', ylabel='Precision')
plt.show()