Example #1
def test_performances():
    X, y = make_blobs(n_samples=1000, random_state=0, centers=2)

    # make labels imbalanced by removing all but 100 instances from class 1
    indexes = np.ones(X.shape[0]).astype(bool)
    ind = np.array([False] * 100 + list(((y == 1)[100:])))
    indexes[ind] = 0
    X = X[indexes]
    y = y[indexes]
    n_samples, n_features = X.shape

    clf = FraudToRules()
    # fit
    clf.fit(X, y)
    # with lists
    clf.fit(X.tolist(), y.tolist())
    y_pred = clf.predict(X)
    assert_equal(y_pred.shape, (n_samples, ))
    # training set performance
    assert_greater(accuracy_score(y, y_pred), 0.83)

    # decision_function agrees with predict
    decision = -clf.decision_function(X)
    assert_equal(decision.shape, (n_samples, ))
    dec_pred = (decision.ravel() < 0).astype(int)
    assert_array_equal(dec_pred, y_pred)
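Stated directly, the two assertions above encode the convention that predict() is the thresholded decision function. A hedged, equivalent restatement using the same fitted objects as above:

assert_array_equal(y_pred, (clf.decision_function(X) > 0).astype(int))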
Example #2
def test_max_samples_attribute():
    X = iris.data
    y = iris.target
    y = (y != 0)

    clf = FraudToRules(max_samples=1.).fit(X, y)
    assert_equal(clf.max_samples_, X.shape[0])

    clf = FraudToRules(max_samples=500)
    assert_warns_message(
        UserWarning, "max_samples will be set to n_samples for estimation",
        clf.fit, X, y)
    assert_equal(clf.max_samples_, X.shape[0])

    clf = FraudToRules(max_samples=0.4).fit(X, y)
    assert_equal(clf.max_samples_, 0.4 * X.shape[0])
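For context, here is a minimal sketch (not the library's code) of how a max_samples argument is typically resolved into the fitted max_samples_ attribute that this test asserts for iris (n_samples = 150); the helper name resolve_max_samples is hypothetical.

import numbers
import warnings

def resolve_max_samples(max_samples, n_samples):
    # integers are capped at n_samples, with a warning when the cap applies
    if isinstance(max_samples, numbers.Integral):
        if max_samples > n_samples:
            warnings.warn("max_samples will be set to n_samples for estimation")
            return n_samples
        return max_samples
    # floats in (0, 1] are interpreted as a fraction of the training set
    return int(max_samples * n_samples)

print(resolve_max_samples(1., 150))   # 150
print(resolve_max_samples(500, 150))  # 150, with a UserWarning
print(resolve_max_samples(0.4, 150))  # 60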
Example #3
def test_fraudetorules():
    """Check various parameter settings."""
    X_train = np.array([[0, 1], [1, 2]])
    y_train = np.array([0, 1])
    X_test = np.array([[2, 1], [1, 1]])

    grid = ParameterGrid({
        "feature_names": [None, ['a', 'b']],
        "precision_min": [0.1],
        "recall_min": [0.1],
        "n_estimators": [1],
        "max_samples": [0.5, 3],
        "max_samples_features": [0.5, 2],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False],
        "max_depth": [2],
        "max_features": ["auto", 1, 0.1],
        "min_samples_split": [2, 0.1],
        "n_jobs": [-1, 1]
    })

    with ignore_warnings():
        for params in grid:
            FraudToRules(random_state=rng,
                         **params).fit(X_train, y_train).predict(X_test)
Example #4
def test_fraudetorules_error():
    """Test that it gives proper exception on deficient input."""
    X = iris.data
    y = iris.target
    y = (y != 0)

    # Test max_samples
    assert_raises(ValueError, FraudToRules(max_samples=-1).fit, X, y)
    assert_raises(ValueError, FraudToRules(max_samples=0.0).fit, X, y)
    assert_raises(ValueError, FraudToRules(max_samples=2.0).fit, X, y)
    # explicitly setting max_samples > n_samples should result in a warning.
    assert_warns_message(
        UserWarning, "max_samples will be set to n_samples for estimation",
        FraudToRules(max_samples=1000).fit, X, y)
    assert_no_warnings(FraudToRules(max_samples=np.int64(2)).fit, X, y)
    assert_raises(ValueError, FraudToRules(max_samples='foobar').fit, X, y)
    assert_raises(ValueError, FraudToRules(max_samples=1.5).fit, X, y)
    assert_raises(ValueError, FraudToRules().fit(X, y).predict, X[:, 1:])
Example #5
def test_fraudetorules_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [4, -7]]
    y = [0] * 6 + [1] * 2
    X_test = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [10, 5],
              [5, -7]]
    # Test FraudToRules on the toy sample
    clf = FraudToRules(random_state=rng, max_samples=1.)
    clf.fit(X, y)
    decision_func = -clf.decision_function(X_test)
    pred = clf.predict(X_test)
    # assert detect outliers:
    assert_greater(np.max(decision_func[:-2]), np.min(decision_func[-2:]))
    assert_array_equal(pred, 6 * [0] + 2 * [1])
Example #6
rng = np.random.RandomState(42)

n_inliers = 1000
n_outliers = 50

# Generate train data
I = 0.5 * rng.randn(int(n_inliers / 2), 2)
X_inliers = np.r_[I + 2, I - 2]
O = 0.5 * rng.randn(n_outliers, 2)
X_outliers = O  # np.r_[O, O + [2, -2]]
X_train = np.r_[X_inliers, X_outliers]
y_train = [0] * n_inliers + [1] * n_outliers

# fit the model
clf = FraudToRules(random_state=rng, n_estimators=100)
clf.fit(X_train, y_train)

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Fraud To Rules")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

a = plt.scatter(X_inliers[:, 0],
                X_inliers[:, 1],
                c='white',
                s=20,
                edgecolor='k')
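A hedged continuation of the plot above (not part of the original excerpt) that overlays the labelled outliers and adds a legend, mirroring the inlier scatter:

b = plt.scatter(X_outliers[:, 0],
                X_outliers[:, 1],
                c='red',
                s=20,
                edgecolor='k')
plt.legend([a, b],
           ["inliers (y=0)", "outliers (y=1)"],
           loc="upper left")
plt.show()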
Example #7
feature_names[4] = 'accepteur_ZBBIOHSD'
data.columns = feature_names
print(feature_names)
data = data.values

n_samples = data.shape[0]
n_samples_train = int(n_samples / 2)
y_train = target[:n_samples_train]
y_test = target[n_samples_train:]
X_train = data[:n_samples_train]
X_test = data[n_samples_train:]

# fit the model
clf = FraudToRules(max_depth=2,
                   max_features=0.5,
                   max_samples_features=0.5,
                   random_state=rng,
                   n_estimators=10,
                   feature_names=feature_names)
clf.fit(X_train, y_train)
RF = RandomForestClassifier()
RF.fit(X_train, y_train)

scoring = clf.decision_function(X_test)

scoring_RF = RF.predict_proba(X_test)[:, 1]
scoring_one_rule = np.zeros(X_test.shape[0])
rule = clf.rules_[0][0]
detected_index = list(
    pd.DataFrame(X_test, columns=feature_names).query(rule).index)
scoring_one_rule[detected_index] = 1
print('best rule precision:', y_test[detected_index].mean())
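Building on the single-rule scoring above, a hedged sketch that reports the test-set precision of every extracted rule. It assumes, as the line clf.rules_[0][0] suggests, that each entry of clf.rules_ carries its rule string in the first position, and it mirrors the indexing used above.

df_test = pd.DataFrame(X_test, columns=feature_names)
for entry in clf.rules_:
    rule_str = entry[0]  # rule string usable with DataFrame.query()
    detected_index = list(df_test.query(rule_str).index)
    if len(detected_index) > 0:
        print(rule_str, '-> precision %0.3f' % y_test[detected_index].mean())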
Example #8
    if dat in ('http', 'smtp'):
        y = (y != b'normal.').astype(int)
        print_outlier_ratio(y)

    n_samples, n_features = X.shape
    n_samples_train = n_samples // 2
    X = X.astype(float)
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    print('--- Fitting the FraudToRules estimator...')
    model = FraudToRules(n_estimators=5, max_depth=5, n_jobs=-1)
    tstart = time()
    model.fit(X_train, y_train)
    fit_time = time() - tstart
    tstart = time()

    scoring = -model.decision_function(X_test)  # the lower, the more abnormal

    print("--- Preparing the plot elements...")
    if with_decision_function_histograms:
        fig, ax = plt.subplots(3, sharex=True, sharey=True)
        bins = np.linspace(-0.5, 0.5, 200)
        ax[0].hist(scoring, bins, color='black')
        ax[0].set_title('Decision function for %s dataset' % dat)
        ax[1].hist(scoring[y_test == 0], bins, color='b', label='normal data')
        ax[1].legend(loc="lower right")