Example #1
0
def test_can_predict():
    """Smoke test: a regression-mode SkopeRules model can fit and predict.

    Relies on ``X``, ``y`` and ``feature_names`` being defined outside this
    function. No assertion is made on the predictions; the test passes as
    long as neither call raises.
    """
    settings = dict(
        regression=True,
        max_depth_duplication=2,
        n_estimators=30,
        precision_min=0.20,
        recall_min=0.20,
        feature_names=feature_names,
    )
    model = SkopeRules(**settings)
    model.fit(X, y)
    model.predict(X)
Example #2
0
def test_performance_not_deteriorate():
    """Check that regression-mode SkopeRules beats a linear-model baseline.

    The choice of baseline is somewhat arbitrary because performance varies
    widely across models. Reference mean squared errors (vanilla settings):

        decision tree regressor: 6946
        random forest regressor: 2970
        linear model: 2820

    Relies on ``X``, ``y`` and ``feature_names`` being defined outside this
    function.
    """
    # Baseline taken from the linear model listed above.
    linear_baseline_mse = 2820

    model = SkopeRules(
        regression=True,
        max_depth_duplication=None,
        max_depth=X.shape[1] // 3,
        n_estimators=850,
        precision_min=0.,
        recall_min=.0,
        feature_names=feature_names,
    )

    # Fixed seed so the train/test partition is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    model.fit(X_train, y_train)
    held_out_mse = mean_squared_error(y_test, model.predict(X_test))
    assert held_out_mse < linear_baseline_mse
Example #3
0
def test_performances():
    """Check SkopeRules on an imbalanced two-blob classification problem.

    Verifies that:
    * fitting works with both array and plain-list inputs,
    * ``predict`` returns one label per sample,
    * training-set accuracy exceeds 0.83,
    * ``predict`` yields 1 exactly where ``decision_function`` is positive.
    """
    X, y = make_blobs(n_samples=1000, random_state=0, centers=2)

    # Make the labels imbalanced: drop every class-1 sample located after the
    # first 100 rows, so class 1 survives only within those first 100 rows.
    drop = (y == 1)
    drop[:100] = False
    keep = ~drop
    X = X[keep]
    y = y[keep]
    n_samples, n_features = X.shape

    clf = SkopeRules()
    # fit with arrays
    clf.fit(X, y)
    # fit again with plain Python lists
    clf.fit(X.tolist(), y.tolist())
    y_pred = clf.predict(X)
    assert_equal(y_pred.shape, (n_samples, ))
    # training set performance
    assert_greater(accuracy_score(y, y_pred), 0.83)

    # decision_function agrees with predict
    decision = -clf.decision_function(X)
    assert_equal(decision.shape, (n_samples, ))
    # ``np.int`` was removed from NumPy (1.24); the builtin ``int`` is the
    # drop-in replacement and yields the same dtype.
    dec_pred = (decision.ravel() < 0).astype(int)
    assert_array_equal(dec_pred, y_pred)
Example #4
0
def test_deduplication_works():
    """Smoke test for SkopeRules with ``max_depth_duplication=3``.

    Fits on a tiny toy sample and calls every scoring/prediction method once.
    Nothing is asserted about the outputs; the test passes as long as no call
    raises. Relies on ``rng`` being defined outside this function.
    """
    # toy sample (the last two samples are outliers)
    X = [
        [-2, -1], [-1, -1], [-1, -2],
        [1, 1], [1, 2], [2, 1],
        [6, 3], [4, -7],
    ]
    y = 6 * [0] + 2 * [1]
    X_test = [
        [-2, -1], [-1, -1], [-1, -2],
        [1, 1], [1, 2], [2, 1],
        [10, 5], [5, -7],
    ]

    model = SkopeRules(random_state=rng, max_samples=1.,
                       max_depth_duplication=3)
    model.fit(X, y)

    # Exercise each public scoring/prediction entry point in turn.
    model.decision_function(X_test)
    model.rules_vote(X_test)
    model.score_top_rules(X_test)
    model.predict(X_test)
    model.predict_top_rules(X_test, 1)
def KfoldAcc(X, y, multiclass=False, k=10):
    """Cross-validate four classifiers and collect their per-fold accuracy.

    The classifiers are k-nearest neighbours, a decision tree, Gaussian naive
    Bayes and SkopeRules, each created fresh with default settings per fold.

    Parameters
    ----------
    X, y : indexable by integer index arrays
        Samples and labels; each is indexed with the train/test index arrays
        produced by ``KFold.split``.
    multiclass : bool, default False
        When exactly ``True``, the SkopeRules model is wrapped in a
        ``OneVsRestClassifier``.
    k : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    dict
        Maps ``'neigh'``, ``'tree'``, ``'naive'`` and ``'rule'`` to a list
        holding one accuracy value per fold.
    """
    # Honour ``k``; the fold count used to be hard-coded to 10.
    kf = KFold(n_splits=k, shuffle=True)
    accuracy = {'neigh': [], 'tree': [], 'naive': [], 'rule': []}

    def make_rule_model():
        # SkopeRules, optionally wrapped for one-vs-rest multiclass use.
        base = SkopeRules()
        if multiclass is True:
            return OneVsRestClassifier(base)
        return base

    builders = (
        ('neigh', KNeighborsClassifier),
        ('tree', DecisionTreeClassifier),
        ('naive', GaussianNB),
        ('rule', make_rule_model),
    )

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        for position, (name, build) in enumerate(builders):
            if position:
                # Separator printed between consecutive models.
                print('---------')
            model = build()
            model.fit(X_train, y_train)
            predicted = model.predict(X_test)
            accuracy[name].append(
                accuracy_score(y_test,
                               predicted,
                               normalize=True,
                               sample_weight=None))

    return accuracy
Example #6
0
def test_skope_rules_works():
    """Check that SkopeRules separates two obvious outliers on a toy sample.

    Every score (``decision_function``, ``rules_vote`` and
    ``separate_rules_score``) must rank both outliers strictly above all
    inliers, and ``predict`` must label exactly the last two test samples
    as 1. Relies on ``rng`` being defined outside this function.
    """
    # toy sample (the last two samples are outliers)
    X = [
        [-2, -1], [-1, -1], [-1, -2],
        [1, 1], [1, 2], [2, 1],
        [6, 3], [4, -7],
    ]
    y = 6 * [0] + 2 * [1]
    X_test = [
        [-2, -1], [-1, -1], [-1, -2],
        [1, 1], [1, 2], [2, 1],
        [10, 5], [5, -7],
    ]

    model = SkopeRules(random_state=rng, max_samples=1.)
    model.fit(X, y)

    # Compute every output before asserting anything.
    outlier_scores = (
        model.decision_function(X_test),
        model.rules_vote(X_test),
        model.separate_rules_score(X_test),
    )
    labels = model.predict(X_test)

    # assert detect outliers: lowest outlier score beats highest inlier score
    for scores in outlier_scores:
        assert_greater(np.min(scores[-2:]), np.max(scores[:-2]))
    assert_array_equal(labels, 6 * [0] + 2 * [1])
Example #7
0
    # NOTE(review): this excerpt starts inside an indented block whose header
    # is not shown. `neigh`, `tree`, `naive`, `accuracy_no`, `data` and the
    # train/test splits are presumably defined earlier in that block or file;
    # confirm against the full source.
    # Rule model labelled with the first 18 column names of `data`.
    rule = SkopeRules(feature_names= data.columns.to_list()[0:18])
    
    # Each model below is fitted on the training split; its accuracy on the
    # test split is appended to the matching list in `accuracy_no`.
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    accuracy_no['neigh'].append(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    accuracy_no['tree'].append(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    
    naive.fit(X_train, y_train)
    y_pred = naive.predict(X_test)
    accuracy_no['naive'].append(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    
    rule.fit(X_train, y_train)
    y_pred = rule.predict(X_test)
    accuracy_no['rule'].append(accuracy_score(y_test, y_pred, normalize=True, sample_weight=None))
    
    
# Feature selection with Boruta: cross-validate over X1 with labels y_bal.
# NOTE(review): presumably X1 holds the Boruta-selected features - confirm.
for train_index, test_index in kf.split(X1):
    X_train, X_test = X1[train_index], X1[test_index]
    y_train, y_test = y_bal[train_index], y_bal[test_index]    # train / test labels
    
    # Fresh, default-configured models for every fold.
    neigh = KNeighborsClassifier()
    tree = DecisionTreeClassifier()
    naive = GaussianNB()
    rule = SkopeRules()
    
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    # NOTE(review): the excerpt ends here; `y_pred` and the other three models
    # are not used within the visible lines.
Example #8
0
def OverSampleKfold(data, k=10):
    """Cross-validate four classifiers on SMOTE-oversampled training folds.

    Columns 0-17 of ``data`` are used as features and column 18 as the label.
    For every fold the training part is oversampled with SMOTE, then k-nearest
    neighbours, a decision tree, Gaussian naive Bayes and SkopeRules are
    fitted on it and scored on the untouched test part. Class counts and a
    few separators are printed along the way.

    Parameters
    ----------
    data : object providing ``to_numpy()`` and ``columns.to_list()``
        The data set (presumably a pandas DataFrame - confirm with callers).
    k : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    tuple of dict
        ``(accuracy, recall, auc)``; each maps ``'neigh'``, ``'tree'``,
        ``'naive'`` and ``'rule'`` to a list with one value per fold.
    """
    values = data.to_numpy()
    X = values[:, 0:18]
    y = values[:, 18]
    sm = SMOTE()
    kf = KFold(n_splits=k, shuffle=True)

    model_names = ('neigh', 'tree', 'naive', 'rule')
    accuracy = {name: [] for name in model_names}
    recall = {name: [] for name in model_names}
    auc = {name: [] for name in model_names}

    # One factory per model so every fold starts from a fresh estimator.
    builders = {
        'neigh': KNeighborsClassifier,
        'tree': DecisionTreeClassifier,
        'naive': GaussianNB,
        'rule': lambda: SkopeRules(
            feature_names=data.columns.to_list()[0:18]),
    }

    for train_index, test_index in kf.split(X):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]  # held-out fold
        positives = np.sum(y_train)
        print('num of pos:', positives, ', num of neg:',
              y_train.size - positives)

        # Only the training part is oversampled.
        X_over, y_over = sm.fit_resample(X_train, y_train)
        print('oversample:', X_over.shape, y_over.shape)
        print('---------------------------------------')

        for position, name in enumerate(model_names):
            if position > 0:
                print('---------')
            model = builders[name]()
            model.fit(X_over, y_over)
            predicted = model.predict(X_test)
            if name == 'rule':
                # The rule model is scored with its rule votes.
                scores = model.rules_vote(X_test)
            else:
                scores = model.predict_proba(X_test)[:, 1]

            recall[name].append(
                recall_score(y_test, predicted, labels=None, pos_label=1))
            accuracy[name].append(
                accuracy_score(y_test,
                               predicted,
                               normalize=True,
                               sample_weight=None))
            auc[name].append(roc_auc_score(y_test, scores))

    return accuracy, recall, auc
Example #9
0
def BalanceSampleKfold(data, k=10):
    """Cross-validate four classifiers on balanced (under+over) training folds.

    Columns 0-17 of ``data`` are used as features and column 18 as the label.
    For every fold the training part is first randomly undersampled with
    ``sampling_strategy=0.20`` and then oversampled with SMOTE. k-nearest
    neighbours, a decision tree, Gaussian naive Bayes and SkopeRules are
    fitted on the balanced data and scored on the untouched test part. Class
    counts and a few separators are printed along the way.

    Parameters
    ----------
    data : object providing ``to_numpy()`` and ``columns.to_list()``
        The data set (presumably a pandas DataFrame - confirm with callers).
    k : int, default 10
        Number of cross-validation folds.

    Returns
    -------
    tuple of dict
        ``(accuracy, recall, auc)``; each maps ``'neigh'``, ``'tree'``,
        ``'naive'`` and ``'rule'`` to a list with one value per fold.
    """
    data_np = data.to_numpy()
    X = data_np[:, 0:18]
    y = data_np[:, 18]
    rus_balance = RandomUnderSampler(
        sampling_strategy=0.20)  # truncate neg to 5*#pos
    sm_balance = SMOTE()  # then oversample pos
    # Honour ``k``; the fold count used to be hard-coded to 10.
    kf = KFold(n_splits=k, shuffle=True)

    model_names = ('neigh', 'tree', 'naive', 'rule')
    accuracy = {name: [] for name in model_names}
    recall = {name: [] for name in model_names}
    auc = {name: [] for name in model_names}

    # One factory per model so every fold starts from a fresh estimator.
    builders = {
        'neigh': KNeighborsClassifier,
        'tree': DecisionTreeClassifier,
        'naive': GaussianNB,
        'rule': lambda: SkopeRules(
            feature_names=data.columns.to_list()[0:18]),
    }

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]  # held-out fold
        print('num of pos:', np.sum(y_train), ', num of neg:',
              y_train.size - np.sum(y_train))

        # Step 1: undersample the training part.
        X_bal, y_bal = rus_balance.fit_resample(X_train, y_train)
        print('1.under:')
        print('num of pos:', np.sum(y_bal), ', num of neg:',
              y_bal.size - np.sum(y_bal))

        # Step 2: oversample the undersampled training part with SMOTE.
        X_bal, y_bal = sm_balance.fit_resample(X_bal, y_bal)
        print('2.over:')
        print('num of pos:', np.sum(y_bal), ', num of neg:',
              y_bal.size - np.sum(y_bal))
        print('---------------------------------------')

        for position, name in enumerate(model_names):
            if position:
                # Separator printed between consecutive models.
                print('---------')
            model = builders[name]()
            model.fit(X_bal, y_bal)
            predicted = model.predict(X_test)
            if name == 'rule':
                # The rule model is scored with its rule votes.
                scores = model.rules_vote(X_test)
            else:
                scores = model.predict_proba(X_test)[:, 1]

            recall[name].append(
                recall_score(y_test, predicted, labels=None, pos_label=1))
            accuracy[name].append(
                accuracy_score(y_test,
                               predicted,
                               normalize=True,
                               sample_weight=None))
            auc[name].append(roc_auc_score(y_test, scores))

    return accuracy, recall, auc
# Train the network, then make predictions on the test set and print the results
neural_network = compute_semi_supervised_learning(neural_network, X_train,
                                                  Y_train)
neural_network_pred = np.array(neural_network.predict_classes(
    np.array(X_test)))
print_statistics("Semi-Supervised Neural Network", neural_network_pred, Y_test)

# ************* Rule Model:  ************************
# Here we compare Skope Rules models: each is fitted on (X_val, Y_val) and
# its predictions on X_test_val are reported against Y_test_val.
# First skope rules model
rule_clf1 = SkopeRules(n_estimators=50,
                       precision_min=0.2,
                       recall_min=0.2,
                       feature_names=feature_names)
rule_clf1.fit(X_val, Y_val)
rule_clf1_ypred = rule_clf1.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 1", rule_clf1_ypred,
                 Y_test_val)
# Second skope rules model
# NOTE(review): configured with exactly the same hyperparameters as model 1 -
# confirm whether a different setting was intended.
rule_clf2 = SkopeRules(n_estimators=50,
                       precision_min=0.2,
                       recall_min=0.2,
                       feature_names=feature_names)
rule_clf2.fit(X_val, Y_val)
rule_clf2_ypred = rule_clf2.predict(X_test_val)
print_statistics("Rule Classifier - Skope Rules - Model 2", rule_clf2_ypred,
                 Y_test_val)
# Third skope rules model
rule_clf3 = SkopeRules(n_estimators=25,
                       precision_min=0.2,
                       recall_min=0.2,