Example #1
from sklearn import linear_model, metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder


def stack(X, y, X_test, y_test):
    # Hold out half of the training data for the second-stage model.
    X, X1, y, y1 = train_test_split(X, y, test_size=0.5)
    #clf1 = GradientBoostingClassifier(n_estimators=10)
    #clf1 = RandomForestClassifier(n_estimators=20)
    clf1 = ExtraTreesClassifier(n_estimators=10, max_depth=None,
                                min_samples_split=2, random_state=0)
    clf2 = linear_model.SGDClassifier(loss='log_loss')  # 'log' in scikit-learn < 1.1
    # handle_unknown='ignore': leaves unseen during fit must not raise at transform time.
    enc = OneHotEncoder(handle_unknown='ignore')
    #clf2 = RandomForestClassifier(n_estimators=10)
    #clf2 = GradientBoostingClassifier(n_estimators=20)
    # Stage 1: fit the forest, then one-hot encode its leaf indices.
    clf1.fit(X, y)
    enc.fit(clf1.apply(X))
    # Stage 2: fit the linear model on the encoded leaves of the held-out half.
    clf2.fit(enc.transform(clf1.apply(X1)), y1)

    # With a GradientBoostingClassifier as clf1, apply() is 3-D and needs [:, :, 0]:
    #prob = clf2.predict_proba(enc.transform(clf1.apply(X_test)[:, :, 0]))[:, 1]
    prob = clf2.predict_proba(enc.transform(clf1.apply(X_test)))[:, 1]
    res = clf2.predict(enc.transform(clf1.apply(X_test)))
    tp, tn, fp, fn = 0, 0, 0, 0
    for value, prediction in zip(y_test, res):
        if prediction and value:
            tp += 1
        if prediction and not value:
            fp += 1
        if not prediction and value:
            fn += 1
        if not prediction and not value:
            tn += 1
    print('TP: {0}, TN: {1}, FP: {2}, FN: {3}'.format(tp, tn, fp, fn))
    print("Precision Score : %f" % metrics.precision_score(y_test, res))
    print("Recall Score : %f" % metrics.recall_score(y_test, res))
    return roc_curve(y_test, prob)
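A minimal sketch of how the function above might be driven, using a synthetic binary dataset (the make_classification data and split sizes here are illustrative assumptions, not part of the original example):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Hypothetical data: 2000 samples, binary labels.
X_all, y_all = make_classification(n_samples=2000, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.25, random_state=0)

# stack() returns the roc_curve tuple for the stacked model.
fpr, tpr, thresholds = stack(X_tr, y_tr, X_te, y_te)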
Example #2
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier


def Extreme_rf_dis(n_trees, X, Y, train_indices, test_indices, seed):
    # bootstrap=True is required when oob_score=True; the forest size must be
    # n_trees, since the similarity below is normalized by it.
    clf = ExtraTreesClassifier(n_estimators=n_trees,
                               random_state=seed,
                               bootstrap=True,
                               oob_score=True,
                               n_jobs=-1)
    clf = clf.fit(X[train_indices], Y[train_indices])
    pred = clf.predict(X[test_indices])
    weight = clf.score(X[test_indices], Y[test_indices])
    #print(1 - clf.oob_score_)
    # Forest similarity: the fraction of trees in which two samples land
    # in the same leaf.
    n_samples = X.shape[0]
    dis = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        dis[i][i] = 1
    res = clf.apply(X)  # leaf indices, shape (n_samples, n_trees)
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            a = np.ravel(res[i])
            b = np.ravel(res[j])
            score = a == b
            d = float(score.sum()) / n_trees
            dis[i][j] = dis[j][i] = d
    # Keep only the columns for the training samples, then split the rows
    # into train and test feature blocks.
    X_features1 = np.transpose(dis)
    X_features2 = X_features1[train_indices]
    X_features3 = np.transpose(X_features2)
    return X_features3[train_indices], X_features3[test_indices], weight, pred
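A sketch of how the function might be invoked, here on the iris dataset with index arrays (the dataset and split are illustrative assumptions):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
idx = np.arange(iris.data.shape[0])
train_idx, test_idx = train_test_split(idx, test_size=0.3, random_state=0)

# Similarity features for train/test rows, test accuracy, and predictions.
D_train, D_test, acc, pred = Extreme_rf_dis(
    n_trees=100, X=iris.data, Y=iris.target,
    train_indices=train_idx, test_indices=test_idx, seed=0)
print(D_train.shape, D_test.shape, acc)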
Example #3
import matplotlib.pyplot as plt
import numpy as np
import umap
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

# training_data (a DataFrame of Titanic features) and y (the labels)
# are assumed to have been prepared earlier.
X = training_data.to_numpy()

# Baseline: UMAP directly on the raw feature matrix.
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(X)
plt.figure()
plt.scatter(embedding[:, 0], embedding[:, 1], c=y, cmap="Spectral", s=8)
plt.gca().set_aspect("equal", "datalim")
cb = plt.colorbar()
loc = np.arange(0, max(y) + 0.5, 1)
cb.set_ticks(loc)
plt.title("UMAP projection of Titanic dataset")

# Use Extra Trees Classifier embedding: represent each sample by its
# per-tree leaf indices and embed those with a Hamming metric.
model = ExtraTreesClassifier(n_estimators=100, min_samples_leaf=10)
model.fit(X, y)
leaves = model.apply(X)  # shape (n_samples, n_estimators)
reducer = umap.UMAP(metric='hamming', random_state=42)
embedding = reducer.fit_transform(leaves)
# plotting the embedding
plt.figure()
plt.scatter(embedding[:, 0], embedding[:, 1], c=y, cmap="Spectral", s=8)
plt.gca().set_aspect("equal", "datalim")
cb = plt.colorbar()
loc = np.arange(0, max(y) + 0.5, 1)
cb.set_ticks(loc)
plt.title(
    "UMAP Projection of Titanic Dataset\n Using Extra Trees Classifier Embedding"
)
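For context, a quick check (reusing the model and leaves fitted above) of why metric='hamming' fits here: apply() yields one leaf index per tree, so the Hamming distance between two rows is the fraction of trees that route the two samples to different leaves, i.e. one minus the similarity computed in Example #2.

# leaves holds one integer leaf index per tree for every sample.
assert leaves.shape == (X.shape[0], 100)
# Hamming distance between two samples = fraction of trees that disagree.
row_a, row_b = leaves[0], leaves[1]
print(np.mean(row_a != row_b))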

# Use DecisionTreeClassifier Embedding
model = DecisionTreeClassifier(max_leaf_nodes=2)