def test_deduplication_works():
    """Smoke-test SkopeRules fitted with ``max_depth_duplication=3``.

    Fits on a toy sample and calls each scoring / prediction method once on
    a test sample. Nothing is asserted about the outputs: the test passes as
    long as none of the calls raises.
    """
    # toy sample (the last two samples are outliers)
    X = [
        [-2, -1], [-1, -1], [-1, -2],
        [1, 1], [1, 2], [2, 1],
        [6, 3], [4, -7],
    ]
    y = 6 * [0] + 2 * [1]
    X_test = [
        [-2, -1], [-1, -1], [-1, -2],
        [1, 1], [1, 2], [2, 1],
        [10, 5], [5, -7],
    ]

    model = SkopeRules(
        max_depth_duplication=3,
        max_samples=1.,
        random_state=rng,
    )
    model.fit(X, y)

    # Return values are deliberately discarded; only successful execution
    # of every public method is being checked here.
    model.decision_function(X_test)
    model.rules_vote(X_test)
    model.score_top_rules(X_test)
    model.predict(X_test)
    model.predict_top_rules(X_test, 1)
def test_skope_rules_works():
    """Check that SkopeRules separates the two outliers of a toy sample.

    The model is fitted on eight points whose last two are labelled 1.
    On the test sample, every scoring method must rank the last two points
    strictly above all the others, and both prediction methods must return
    0 for the first six points and 1 for the last two.
    """
    # toy sample (the last two samples are outliers)
    X = [
        [-2, -1], [-1, -1], [-1, -2],
        [1, 1], [1, 2], [2, 1],
        [6, 3], [4, -7],
    ]
    y = 6 * [0] + 2 * [1]
    X_test = [
        [-2, -1], [-1, -1], [-1, -2],
        [1, 1], [1, 2], [2, 1],
        [10, 5], [5, -7],
    ]
    expected_labels = 6 * [0] + 2 * [1]

    model = SkopeRules(max_samples=1., random_state=rng)
    model.fit(X, y)

    # Compute every output first, then assert, so that all methods are
    # exercised before the first assertion can fail.
    score_arrays = (
        model.decision_function(X_test),
        model.rules_vote(X_test),
        model.score_top_rules(X_test),
    )
    predicted = model.predict(X_test)
    predicted_top_rules = model.predict_top_rules(X_test, 1)

    # assert detect outliers: lowest outlier score beats highest inlier score
    for scores in score_arrays:
        assert_greater(np.min(scores[-2:]), np.max(scores[:-2]))
    assert_array_equal(predicted, expected_labels)
    assert_array_equal(predicted_top_rules, expected_labels)
def main():
    """Train SkopeRules on vectorized message text and report its metrics.

    Loads the table returned by ``get_feat_scores()``, splits it 70/30,
    vectorizes the ``message`` column with a uni/bi-gram ``CountVectorizer``,
    fits a ``SkopeRules`` model on the training part, then prints
    micro-averaged recall, F1 and precision for the held-out part and shows
    its precision-recall curve.
    """
    mail = get_feat_scores()  # presumably a pandas DataFrame -- it is used via .drop(columns=...)
    train, test = train_test_split(mail, test_size=0.3)  # split up data

    # Training data: drop the label from the inputs, and drop every
    # non-label column to obtain the targets.
    x_train = train.drop(columns=['label'])
    y_train = train.drop(columns=['message', 'sf', 'hf'])
    cv = CountVectorizer(input='content',
                         stop_words=stp.words('english'),
                         ngram_range=(1, 2))
    x_tr = cv.fit_transform(x_train.message)  # vectorize the training text

    # NOTE(review): feature_names lists only 'sf' and 'hf', but the model is
    # fitted on the vectorized message columns -- confirm this is intended.
    skr = SkopeRules(n_estimators=30, feature_names=['sf', 'hf'])
    # Flatten the targets into a 1-d integer array for the estimator.
    y_train = y_train.to_numpy().ravel().astype('int')
    skr.fit(x_tr.toarray(), y_train)

    # Test data: built from the held-out `test` split. Reusing `train` here
    # would evaluate the model on the very rows it was fitted on and make
    # every reported metric optimistic.
    x_test = test.drop(columns=['label'])
    y_test = test.drop(columns=['message', 'sf', 'hf'])
    # Only transform (no re-fit), so test columns match the training vocabulary.
    x_tst = cv.transform(x_test.message)
    y_test = y_test.to_numpy().ravel().astype('int')
    y_score = skr.score_top_rules(x_tst.toarray())

    # Metrics.
    # NOTE(review): y_score comes from score_top_rules, not predict; confirm
    # that feeding these scores to the classification metrics is intended.
    recall_scr = recall_score(y_test, y_score, average='micro')
    f1_scr = f1_score(y_test, y_score, average='micro')
    pr_score = precision_score(y_test, y_score, average='micro')
    print("recall: " + str(recall_scr))
    print("f1: " + str(f1_scr))
    print("precision: " + str(pr_score))

    # Plot the precision-recall curve; the thresholds are not needed.
    precision, recall, _ = precision_recall_curve(y_test, y_score)
    plt.plot(recall, precision)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision Recall curve')
    plt.show()
# Fit a SkopeRules model on the training split.
clf = SkopeRules(
    max_depth_duplication=3,
    max_depth=3,
    max_features=0.5,
    max_samples_features=0.5,
    random_state=rng,
    n_estimators=20,
    feature_names=feature_names,
    recall_min=0.04,
    precision_min=0.6,
)
clf.fit(X_train, y_train)

# in the score_top_rules method, a score of k means that rule number k
# vote positively, but not rules 1, ..., k-1. It will allow us to plot
# performance of each rule separately on the ROC and PR plots.
scoring = clf.score_top_rules(X_test)

# Report how many rules were kept and show the first five of them.
print(str(len(clf.rules_)) + ' rules have been built.')
print('The 5 most precise rules are the following:')
for rule in clf.rules_[:5]:
    # Only the first element of each rule entry is printed.
    print(rule[0])

# Curve functions and their axis labels, index-aligned: ROC first, PR second.
curves = [roc_curve, precision_recall_curve]
xlabels = ['False Positive Rate', 'Recall (True Positive Rate)']
ylabels = ['True Positive Rate (Recall)', 'Precision']

# One row of two panels that share both axes.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5),
                         sharex=True, sharey=True)

# First panel: ROC points for the rule scores and for `scoring_rf`
# (defined outside this fragment).
ax = axes[0]
fpr, tpr, _ = roc_curve(y_test, scoring)
fpr_rf, tpr_rf, _ = roc_curve(y_test, scoring_rf)
from sklearn.datasets import load_boston
from sklearn.metrics import precision_recall_curve
from matplotlib import pyplot as plt
from skrules import SkopeRules
import seaborn as sns

# Example script: fit SkopeRules on the first half of the Boston data and
# plot the precision/recall trade-off of its rules on the second half.

# NOTE(review): load_boston was removed in scikit-learn 1.2; this script
# needs an older scikit-learn or a replacement dataset.
dataset = load_boston()
clf = SkopeRules(max_depth_duplication=None,
                 n_estimators=30,
                 precision_min=0.2,
                 recall_min=0.01,
                 feature_names=dataset.feature_names)

# Binary target: True where the regression target exceeds 25.
X, y = dataset.data, dataset.target > 25

# Deterministic split: first half for training, second half for testing.
half = len(y) // 2
X_train, y_train = X[:half], y[:half]
X_test, y_test = X[half:], y[half:]

clf.fit(X_train, y_train)
y_score = clf.score_top_rules(X_test)

# The thresholds returned by precision_recall_curve are not needed.
precision, recall, _ = precision_recall_curve(y_test, y_score)
print(recall)
print("Precision", precision)

# Line plot of the precision-recall curve.
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall curve')
plt.show()

# Bar-plot view of the same points. x and y are passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally; keywords also work on
# older versions.
ax = sns.barplot(x=recall, y=precision)
ax.set(xlabel='Recall', ylabel='Precision')
plt.show()