# Beispiel (Example) #1
# 0
def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Evaluate a group-sparse CDClassifier on motif features.

    Runs `k` bootstrap rounds: each round resamples `nsample` labelled rows
    (with replacement), trains on the first ~80% and scores on the rest.

    Parameters
    ----------
    df : pandas.DataFrame whose index identifies the rows to label.
    sets : passed to select_sets(); each returned row collection becomes a class.
    motifs : pandas.DataFrame of features, indexed like `df`.
    alpha : regularization strength for the l1/l2 penalty.
    nsample : bootstrap sample size per round.
    k : number of bootstrap rounds.
    cutoff : unused; kept for interface compatibility.

    Returns
    -------
    (alpha, median accuracy over rounds, median fraction of nonzero weights).
    """
    ret = select_sets(df, sets)
    # Assign class labels 1..len(sets); rows in no set keep label 0.
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    # Drop unlabelled rows, then shift labels back to 0-based.
    y = y[y["label"] > 0]
    y -= 1

    clf = CDClassifier(penalty="l1/l2",
                       loss="squared_hinge",
                       multiclass=len(sets) > 2,
                       max_iter=20,
                       alpha=alpha,
                       C=1.0 / motifs.shape[0],
                       tol=1e-3)

    accs = []
    fractions = []

    # 80/20 split boundary. int() is required: `nsample * 0.8 + 1` is a
    # float, and slicing a NumPy array with a float raises TypeError.
    split = int(nsample * 0.8) + 1

    for _ in range(k):
        # Bootstrap: sample row positions with replacement.
        idx = np.random.choice(range(y.shape[0]), nsample, replace=True)

        y_train = y.iloc[idx[:split]]
        X_train = motifs.loc[y_train.index].values
        y_train = y_train.values.flatten()

        y_test = y.iloc[idx[split:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        # train the model
        clf.fit(X_train, y_train)

        accs.append(clf.score(X_test, y_test))
        # Fraction of features with a nonzero coefficient (sparsity measure).
        fractions.append(clf.n_nonzero(percentage=True))

    return alpha, np.median(accs), np.median(fractions)
# Beispiel (Example) #2
# 0
def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Evaluate a group-sparse CDClassifier on motif features.

    Runs `k` bootstrap rounds: each round resamples `nsample` labelled rows
    (with replacement), trains on the first ~80% and scores on the rest.

    Parameters
    ----------
    df : pandas.DataFrame whose index identifies the rows to label.
    sets : passed to select_sets(); each returned row collection becomes a class.
    motifs : pandas.DataFrame of features, indexed like `df`.
    alpha : regularization strength for the l1/l2 penalty.
    nsample : bootstrap sample size per round.
    k : number of bootstrap rounds.
    cutoff : unused; kept for interface compatibility.

    Returns
    -------
    (alpha, median accuracy over rounds, median fraction of nonzero weights).
    """
    ret = select_sets(df, sets)
    # Assign class labels 1..len(sets); rows in no set keep label 0.
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    # Drop unlabelled rows, then shift labels back to 0-based.
    y = y[y["label"] > 0]
    y -= 1

    clf = CDClassifier(penalty="l1/l2",
                       loss="squared_hinge",
                       multiclass=len(sets) > 2,
                       max_iter=20,
                       alpha=alpha,
                       C=1.0 / motifs.shape[0],
                       tol=1e-3)

    accs = []
    fractions = []

    # 80/20 split boundary. int() is required: `nsample * 0.8 + 1` is a
    # float, and slicing a NumPy array with a float raises TypeError.
    split = int(nsample * 0.8) + 1

    for _ in range(k):
        # Bootstrap: sample row positions with replacement.
        idx = np.random.choice(range(y.shape[0]), nsample, replace=True)

        y_train = y.iloc[idx[:split]]
        X_train = motifs.loc[y_train.index].values
        y_train = y_train.values.flatten()

        y_test = y.iloc[idx[split:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        # train the model
        clf.fit(X_train, y_train)

        accs.append(clf.score(X_test, y_test))
        # Fraction of features with a nonzero coefficient (sparsity measure).
        fractions.append(clf.n_nonzero(percentage=True))

    return alpha, np.median(accs), np.median(fractions)
# Train with a hand-rolled fit()/score() pair, then compare against
# lightning's Cython CDClassifier on the same data.
# NOTE(review): n_samples, n_features, n_classes, X, y, groups, ColumnData,
# fit and score are all defined outside this fragment -- verify against the
# full script.
one_over_n = 1. / float(n_samples)
ds = ColumnData(X)
# Weight matrix filled in-place by fit(): one column per class.
coefs_ = np.zeros((n_features, n_classes))

fit( ds, y, one_over_n, n_samples, n_features, n_classes,coefs_,groups)
s =  score (X,y,coefs_)
print "score = ", s

print '======================================================'

clf_max_iter=300
clf_tol = 1e-3
print "### Equivalent Lightning Cython Implementation ###"
# Same l1/l2 group penalty; permute=False / random_state=0 keep the
# coordinate order deterministic for the comparison.
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf_max_iter,
                         alpha=0.5, # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf_tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print "Acc:", light_clf.score(X, y)
print light_clf.coef_.T





import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Benchmark: effect of warm_start when sweeping a regularization path.
# News20 is reduced to a binary problem: class 0 vs. everything else.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
y[y >= 1] = 1  # binarize labels

# 20 values of C, log-spaced between 1e-3 and 1e3.
Cs = np.logspace(-3, 3, 20)

for warm_start in (True, False):
    clf = CDClassifier(loss="squared_hinge",
                       tol=1e-3,
                       max_iter=100,
                       warm_start=warm_start)

    scores = []
    start = time.time()
    for C in Cs:
        # With warm_start=True, the previous solution seeds the next fit,
        # which should make the sweep faster.
        clf.C = C
        clf.fit(X, y)
        scores.append(clf.score(X, y))

    print "Total time", time.time() - start
    print "Average accuracy", np.mean(scores)
        # Build model: sweep an L1-penalized logistic regression over a grid
        # of C values (C is the INVERSE regularization strength -- smaller C
        # means stronger regularization and more zeroed-out coefficients).
        # NOTE(review): X_train, X_test, y_train, y_test and `writer` are
        # defined outside this fragment -- verify against the full method.
        for C in [1, 0.1, 0.01, 0.001, 0.0001]:
            # Fit a sparse (L1) logistic regression for this C.
            clf = LogisticRegression(
                C=C, penalty='l1', tol=0.001
            )  # C must be a positive float; like in SVMs, smaller values specify stronger regularization.
            clf.fit(X_train, y_train)

            # Count and percentage of features with a nonzero coefficient.
            num = len(clf.coef_[0].nonzero()[0])
            p = len(clf.coef_[0].nonzero()[0]) * 1.0 / len(X_train.columns)
            print '%s = 0, %s = 1' % tuple(clf.classes_)
            print 'C: ', C
            print 'Prediction accuracy: ', clf.score(X_test, y_test)
            print 'Features left (# / %): ', num, '/', p

            # Write the class-label mapping only once (first iteration).
            if C == 1:
                writer.write('%s = 0, %s = 1 \n' % tuple(clf.classes_))
            writer.write('C: %s \n' % C)
            writer.write('Accuracy: %s \n' % clf.score(X_test, y_test))
            writer.write('Features left (#/%%): %s / %s \n' % (num, p))

            # When the model is sparse enough (< half the features survive),
            # list the selected features with their rounded weights.
            if p < 0.5:
                idx = clf.coef_[0].nonzero()
                ws = clf.coef_[0][idx].round(3).astype(str)
                fs = X_train.columns[idx]
                tmp = fs + ' (' + ws + ')'
                print 'Selected features: %s' % ', '.join(tmp)
import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Benchmark: effect of warm_start when sweeping a regularization path.
# News20 is reduced to a binary problem: class 0 vs. everything else.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
y[y >= 1] = 1  # binarize labels

# 20 values of C, log-spaced between 1e-3 and 1e3.
Cs = np.logspace(-3, 3, 20)

for warm_start in (True, False):
    clf = CDClassifier(loss="squared_hinge", tol=1e-3, max_iter=100,
                       warm_start=warm_start)

    scores = []
    start = time.time()
    for C in Cs:
        # With warm_start=True, the previous solution seeds the next fit.
        clf.C = C
        clf.fit(X, y)
        scores.append(clf.score(X, y))

    print "Total time", time.time() - start
    print "Average accuracy", np.mean(scores)
# Source:
# http://contrib.scikit-learn.org/lightning/

from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Load News20 dataset from scikit-learn (full multiclass problem).
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target

# Set classifier options. The "l1/l2" group penalty zeroes entire rows of
# the coefficient matrix, i.e. drops features across all classes at once.
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   C=1.0 / X.shape[0],
                   tol=1e-3)

# Train the model.
clf.fit(X, y)

# Accuracy (on the training data).
print(clf.score(X, y))

# Percentage of selected (nonzero-weight) features.
print(clf.n_nonzero(percentage=True))
# Beispiel (Example) #8
# 0
# Build TF-IDF features from token lists, then compare CDClassifier against
# a LatentGroupClassifier on time, score and learned weights.
# NOTE(review): texts, labels, mmclf (module), dict_text, heapq and time are
# defined/imported outside this fragment.
texts = [ " ".join(text) for text in texts]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts)
y_train = labels

# Baseline: lightning CDClassifier with the l1/l2 group penalty.
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=15,
                   alpha=1e-4,
                   C=1.0 / X_train.shape[0],
                   tol=1e-6, verbose=5)


# NOTE(review): this rebinds the module name `mmclf` to the fitted instance.
mmclf = mmclf.LatentGroupClassifier(max_iter=15, C=1.0 / X_train.shape[0])
start = time()
clf.fit(X_train, y_train)
elapsed = time() - start
print "CDClassifier time", elapsed
print "CDClassifier score", clf.score(X_train, y_train)
start = time()
mmclf.fit(X_train, y_train)
elapsed = time() - start
print "LatentGroupClassifier time", elapsed
print "LatentGroupClassifier score", mmclf.score(X_train, y_train)
print "CDClassifier weights\n", clf.coef_.T
print "LatentGroupClassifier weights\n", mmclf.coefs_.T
print "features", vectorizer.vocabulary_
top_words =  30

# For each topic (weight-matrix column), collect words with positive weight
# and print the `top_words` largest; nlargest on strings is lexicographic.
# NOTE(review): CDClassifier exposes `coef_`, not `coefs_` -- this loop
# probably means mmclf.coefs_; verify against the original script.
print "==== Keywords ==== "
for m in xrange(clf.coefs_.shape[1]):
	t = []
	print 'Topic',m
	for row in xrange(clf.coefs_.shape[0]):
		if( clf.coefs_[row,m] >0):
			t.append( dict_text[row])
	for k in heapq.nlargest(top_words,t):
		print k,
	print
 

print "==== Lightning Cython Implementation (Row-wise sparsity) ====="
# Reference run: lightning's coordinate-descent classifier with the same
# l1/l2 group penalty (entire rows of the weight matrix zeroed together).
# NOTE(review): X, y and `clf` (source of max_iter/tol) are defined outside
# this fragment.
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf.max_iter,
                         alpha=1e-4, # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf.tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print "==========>> Accuracy :", light_clf.score(X, y)
print "Weight Matrix:"
print (light_clf.coef_.T)

import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Benchmark: effect of the shrinking heuristic on training time and accuracy.
# Load News20 dataset from scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
y[y >= 1] = 1  # binarize: class 0 vs. everything else

for shrinking in (True, False):
    # Same model either way; only the shrinking flag differs.
    clf = CDClassifier(C=1.0, loss="squared_hinge", penalty="l1", tol=1e-3,
                       max_iter=1000, shrinking=shrinking, random_state=0)
    start = time.time()
    clf.fit(X, y)
    print "Training time", time.time() - start
    print "Accuracy", clf.score(X, y)