import numpy as np
from numpy.testing import assert_array_equal
from skrules import SkopeRules

rng = np.random.RandomState(0)  # module-level RNG so the tests are reproducible


def test_deduplication_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2],
              [1, 1], [1, 2], [2, 1], [10, 5], [5, -7]]

    # Smoke test: fitting with rule deduplication enabled must not raise,
    # and every scoring/prediction method should run end to end.
    clf = SkopeRules(random_state=rng, max_samples=1., max_depth_duplication=3)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    score_top_rules = clf.score_top_rules(X_test)
    pred = clf.predict(X_test)
    pred_score_top_rules = clf.predict_top_rules(X_test, 1)
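
# Illustration (not part of the original tests): after fitting, the surviving
# deduplicated rules can be inspected through the estimator's `rules_`
# attribute, which pairs each selected rule with its performance statistics.
# The toy data below mirrors the test fixture above.
def show_deduplicated_rules():
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    clf = SkopeRules(random_state=0, max_samples=1., max_depth_duplication=3)
    clf.fit(X, y)
    for rule in clf.rules_:
        print(rule)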
def test_skope_rules_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2],
              [1, 1], [1, 2], [2, 1], [10, 5], [5, -7]]

    # Fit SkopeRules on the toy data and score the held-out points.
    clf = SkopeRules(random_state=rng, max_samples=1.)
    clf.fit(X, y)
    decision_func = clf.decision_function(X_test)
    rules_vote = clf.rules_vote(X_test)
    score_top_rules = clf.score_top_rules(X_test)
    pred = clf.predict(X_test)

    # assert detect outliers: the two outliers must score strictly higher
    # than every inlier under each scoring method.
    assert np.min(decision_func[-2:]) > np.max(decision_func[:-2])
    assert np.min(rules_vote[-2:]) > np.max(rules_vote[:-2])
    assert np.min(score_top_rules[-2:]) > np.max(score_top_rules[:-2])
    assert_array_equal(pred, 6 * [0] + 2 * [1])
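
# Illustration (not in the original suite): the assertion pattern shared by
# the tests above, factored into a helper. The last `n_outliers` test points
# must score strictly higher than every inlier under a given scoring method.
def assert_outliers_ranked_last(scores, n_outliers=2):
    assert np.min(scores[-n_outliers:]) > np.max(scores[:-n_outliers])

# e.g. assert_outliers_ranked_last(clf.decision_function(X_test))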
import numpy as np
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score
from skrules import SkopeRules


def OverSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]  # first 18 columns are the features
    y = data_np[:, 18]    # column 18 is the binary label
    sm = SMOTE()
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print('num of pos:', np.sum(y_train),
              ', num of neg:', y_train.size - np.sum(y_train))
        # oversample only the training fold; the test fold stays untouched
        X_over, y_over = sm.fit_resample(X_train, y_train)
        print('oversample:', X_over.shape, y_over.shape)
        print('---------------------------------------')

        # k-nearest neighbours
        neigh = KNeighborsClassifier()
        neigh.fit(X_over, y_over)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]
        recall['neigh'].append(recall_score(y_test, neigh_y_pred, pos_label=1))
        accuracy['neigh'].append(accuracy_score(y_test, neigh_y_pred))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))
        print('---------')

        # decision tree
        tree = DecisionTreeClassifier()
        tree.fit(X_over, y_over)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]
        recall['tree'].append(recall_score(y_test, tree_y_pred, pos_label=1))
        accuracy['tree'].append(accuracy_score(y_test, tree_y_pred))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))
        print('---------')

        # Gaussian naive Bayes
        naive = GaussianNB()
        naive.fit(X_over, y_over)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]
        recall['naive'].append(recall_score(y_test, naive_y_pred, pos_label=1))
        accuracy['naive'].append(accuracy_score(y_test, naive_y_pred))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))
        print('---------')

        # SkopeRules; rules_vote gives a continuous score for the ROC AUC
        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_over, y_over)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)
        recall['rule'].append(recall_score(y_test, rules_y_pred, pos_label=1))
        accuracy['rule'].append(accuracy_score(y_test, rules_y_pred))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))
    return accuracy, recall, auc
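
# Hypothetical usage sketch (`df` and 'dataset.csv' are placeholders, not
# from the original code): the helper expects a DataFrame whose first 18
# columns are features and whose 19th column is the binary label.
#
#   import pandas as pd
#   df = pd.read_csv('dataset.csv')  # placeholder path
#   acc, rec, auc_scores = OverSampleKfold(df, k=10)
#   for name in ('neigh', 'tree', 'naive', 'rule'):
#       print(f"{name}: accuracy={np.mean(acc[name]):.3f}, "
#             f"recall={np.mean(rec[name]):.3f}, "
#             f"AUC={np.mean(auc_scores[name]):.3f}")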
def BalanceSampleKfold(data, k=10):
    data_np = data.to_numpy()
    X = data_np[:, 0:18]  # first 18 columns are the features
    y = data_np[:, 18]    # column 18 is the binary label
    # first undersample negatives down to 5x the positives (pos/neg = 0.20),
    # then SMOTE the positives up until both classes are balanced
    rus_balance = RandomUnderSampler(sampling_strategy=0.20)
    sm_balance = SMOTE()
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    recall = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    auc = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print('num of pos:', np.sum(y_train),
              ', num of neg:', y_train.size - np.sum(y_train))
        # resample only the training fold; the test fold stays untouched
        X_bal, y_bal = rus_balance.fit_resample(X_train, y_train)
        print('1.under:')
        print('num of pos:', np.sum(y_bal),
              ', num of neg:', y_bal.size - np.sum(y_bal))
        X_bal, y_bal = sm_balance.fit_resample(X_bal, y_bal)
        print('2.over:')
        print('num of pos:', np.sum(y_bal),
              ', num of neg:', y_bal.size - np.sum(y_bal))
        print('---------------------------------------')

        # k-nearest neighbours
        neigh = KNeighborsClassifier()
        neigh.fit(X_bal, y_bal)
        neigh_y_pred = neigh.predict(X_test)
        neigh_y_score = neigh.predict_proba(X_test)[:, 1]
        recall['neigh'].append(recall_score(y_test, neigh_y_pred, pos_label=1))
        accuracy['neigh'].append(accuracy_score(y_test, neigh_y_pred))
        auc['neigh'].append(roc_auc_score(y_test, neigh_y_score))
        print('---------')

        # decision tree
        tree = DecisionTreeClassifier()
        tree.fit(X_bal, y_bal)
        tree_y_pred = tree.predict(X_test)
        tree_y_score = tree.predict_proba(X_test)[:, 1]
        recall['tree'].append(recall_score(y_test, tree_y_pred, pos_label=1))
        accuracy['tree'].append(accuracy_score(y_test, tree_y_pred))
        auc['tree'].append(roc_auc_score(y_test, tree_y_score))
        print('---------')

        # Gaussian naive Bayes
        naive = GaussianNB()
        naive.fit(X_bal, y_bal)
        naive_y_pred = naive.predict(X_test)
        naive_y_score = naive.predict_proba(X_test)[:, 1]
        recall['naive'].append(recall_score(y_test, naive_y_pred, pos_label=1))
        accuracy['naive'].append(accuracy_score(y_test, naive_y_pred))
        auc['naive'].append(roc_auc_score(y_test, naive_y_score))
        print('---------')

        # SkopeRules; rules_vote gives a continuous score for the ROC AUC
        rule = SkopeRules(feature_names=data.columns.to_list()[0:18])
        rule.fit(X_bal, y_bal)
        rules_y_pred = rule.predict(X_test)
        rule_y_score = rule.rules_vote(X_test)
        recall['rule'].append(recall_score(y_test, rules_y_pred, pos_label=1))
        accuracy['rule'].append(accuracy_score(y_test, rules_y_pred))
        auc['rule'].append(roc_auc_score(y_test, rule_y_score))
    return accuracy, recall, auc
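
# Hypothetical sketch (not from the original code) comparing the two
# resampling strategies by mean AUC; `df` is the same placeholder DataFrame
# as in the sketch above (18 feature columns plus a binary label column).
def compare_sampling_strategies(df, k=10):
    _, _, auc_over = OverSampleKfold(df, k=k)
    _, _, auc_bal = BalanceSampleKfold(df, k=k)
    for name in ('neigh', 'tree', 'naive', 'rule'):
        print(f"{name}: SMOTE-only AUC={np.mean(auc_over[name]):.3f}, "
              f"under+SMOTE AUC={np.mean(auc_bal[name]):.3f}")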