Example #1
import numpy as np
import pandas as pd

from lightning.classification import CDClassifier


def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    # select_sets() is provided by the surrounding project and returns, for
    # each requested set, the row labels of df that belong to it.
    ret = select_sets(df, sets)

    # Give every row a 1-based set label, drop unlabelled rows and shift the
    # labels back to 0-based class indices.
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    y = y[y["label"] > 0]
    y -= 1

    # Group-sparse (l1/l2-penalised) linear classifier from lightning.
    clf = CDClassifier(penalty="l1/l2",
                       loss="squared_hinge",
                       multiclass=len(sets) > 2,
                       max_iter=20,
                       alpha=alpha,
                       C=1.0 / motifs.shape[0],
                       tol=1e-3)

    accs = []
    fractions = []

    for i in np.arange(k):

        # Draw a bootstrap sample of rows and split it roughly 80/20 into
        # training and test indices (integer split point, so the slices below
        # are valid).
        idx = np.random.choice(range(y.shape[0]), nsample, replace=True)
        n_train = int(nsample * 0.8) + 1

        y_pred = y.iloc[idx[:n_train]]
        X_pred = motifs.loc[y_pred.index].values
        y_pred = y_pred.values.flatten()

        y_test = y.iloc[idx[n_train:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        # train the model
        clf.fit(X_pred, y_pred)

        acc = clf.score(X_test, y_test)
        fraction = clf.n_nonzero(percentage=True)

        accs.append(acc)
        fractions.append(fraction)

    # print(alpha, accs, fractions)
    return alpha, np.median(accs), np.median(fractions)
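
A minimal usage sketch, not part of the original snippet: df, sets and motifs are hypothetical stand-ins for the data frame, the set definitions and the motif score matrix that the surrounding project (which also defines select_sets) would supply; it only illustrates how eval_model could be used to scan several regularisation strengths.

# Hypothetical driver loop; df, sets, motifs and select_sets are assumed to
# be supplied by the surrounding project.
results = [eval_model(df, sets, motifs, alpha)
           for alpha in (1e-5, 1e-4, 1e-3, 1e-2)]

# Keep the alpha with the best median held-out accuracy.
best_alpha, best_acc, best_frac = max(results, key=lambda r: r[1])
print(best_alpha, best_acc, best_frac)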
Example #2
# Source: http://contrib.scikit-learn.org/lightning/

from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Load News20 dataset from scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target

# Set classifier options.
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   C=1.0 / X.shape[0],
                   tol=1e-3)

# Train the model.
clf.fit(X, y)

# Accuracy
print(clf.score(X, y))

# Percentage of selected features
print(clf.n_nonzero(percentage=True))
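
The accuracy above is computed on the same documents the model was fitted on. As a hedged extension (not part of the lightning example), the same classifier could be evaluated on a held-out split with scikit-learn's train_test_split, reusing X, y and clf from above:

from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing (added here for illustration
# only; the original example scores on the training data).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

clf.fit(X_train, y_train)

# Held-out accuracy and percentage of selected features.
print(clf.score(X_test, y_test))
print(clf.n_nonzero(percentage=True))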