Beispiel #1
0
class Solver(BaseSolver):
    """Benchmark solver wrapping lightning's coordinate-descent classifier."""

    name = 'Lightning'

    install_cmd = 'conda'
    requirements = [
        'pip:git+https://github.com/scikit-learn-contrib/lightning.git'
    ]

    def set_objective(self, X, y, lmbd):
        """Store the problem data and configure the l1 logistic model."""
        self.X, self.y, self.lmbd = X, y, lmbd

        # tol=0 plus an external max_iter lets the benchmark control the
        # exact number of coordinate-descent passes per run() call.
        self.clf = CDClassifier(
            loss='log', penalty='l1', C=1, alpha=self.lmbd,
            tol=0, permute=False, shrinking=False, warm_start=False,
        )

    def run(self, n_iter):
        """Fit the classifier for exactly ``n_iter`` iterations."""
        self.clf.max_iter = n_iter
        self.clf.fit(self.X, self.y)

    def get_result(self):
        """Return the fitted coefficients as a flat 1-D array."""
        return self.clf.coef_.flatten()
Beispiel #2
0
def Light_lasso(X, y, alpha_):
    """Sparse feature selection via an l1/l2-penalized classifier.

    Fits a squared-hinge CDClassifier and keeps only the columns of X
    whose coefficients are non-zero.

    Returns the reduced design matrix and the selected column indices.
    """
    model = CDClassifier(
        penalty="l1/l2",
        loss="squared_hinge",
        # multiclass=True,
        max_iter=50,
        alpha=alpha_,
        C=1.0 / X.shape[0],
        tol=1e-3)
    model.fit(X, y)
    # np.nonzero on the 2-D coef_ matrix returns (row, column) index
    # arrays; the column indices identify the selected features.
    rows, cols = np.nonzero(model.coef_)
    X = X[:, cols]
    return X, cols
Beispiel #3
0
def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Estimate accuracy and sparsity of an l1/l2 model for one alpha.

    Labels rows of *df* according to *sets*, then runs *k* bootstrap
    rounds of an l1/l2-penalized squared-hinge classifier on the motif
    features, with an 80/20 train/test split of each bootstrap sample.

    Returns
    -------
    tuple
        (alpha, median accuracy, median fraction of non-zero weights).
    """
    ret = select_sets(df, sets)
    # Rows in set i get label i + 1; unlabeled rows keep the default 0.
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    # Drop unlabeled rows and shift labels back to a 0-based range.
    y = y[y["label"] > 0]
    y -= 1

    clf = CDClassifier(penalty="l1/l2",
                       loss="squared_hinge",
                       multiclass=len(sets) > 2,
                       max_iter=20,
                       alpha=alpha,
                       C=1.0 / motifs.shape[0],
                       tol=1e-3)

    accs = []
    fractions = []

    # 80/20 split point. Bug fix: slicing with a float expression
    # (nsample * 0.8 + 1) raises TypeError in Python 3 — cast to int.
    split = int(nsample * 0.8) + 1

    for i in np.arange(k):

        # Bootstrap sample of row indices (with replacement).
        idx = np.random.choice(range(y.shape[0]), nsample, replace=True)

        y_train = y.iloc[idx[:split]]
        X_train = motifs.loc[y_train.index].values
        y_train = y_train.values.flatten()

        y_test = y.iloc[idx[split:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        # train the model
        clf.fit(X_train, y_train)

        acc = clf.score(X_test, y_test)
        fraction = clf.n_nonzero(percentage=True)

        accs.append(acc)
        fractions.append(fraction)

    return alpha, np.median(accs), np.median(fractions)
Beispiel #4
0
def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Bootstrap evaluation of an l1/l2-penalized classifier.

    Builds labels from *sets*, then fits a squared-hinge CDClassifier
    on the motif features over *k* bootstrap resamples, scoring each on
    a held-out 20% slice of the resample.

    Returns
    -------
    tuple
        (alpha, median accuracy, median fraction of non-zero weights).
    """
    ret = select_sets(df, sets)
    y = pd.DataFrame({"label": 0}, index=df.index)
    # Mark membership: set i -> label i + 1 (0 means "not in any set").
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    y = y[y["label"] > 0]
    y -= 1  # back to 0-based class labels

    clf = CDClassifier(penalty="l1/l2",
                       loss="squared_hinge",
                       multiclass=len(sets) > 2,
                       max_iter=20,
                       alpha=alpha,
                       C=1.0 / motifs.shape[0],
                       tol=1e-3)

    accs = []
    fractions = []

    # Fix: the original sliced with a float (nsample * 0.8 + 1), which is
    # a TypeError under Python 3; compute an integer split index instead.
    n_train = int(nsample * 0.8) + 1

    for i in np.arange(k):

        idx = np.random.choice(range(y.shape[0]), nsample, replace=True)

        y_pred = y.iloc[idx[:n_train]]
        X_pred = motifs.loc[y_pred.index].values
        y_pred = y_pred.values.flatten()

        y_test = y.iloc[idx[n_train:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        # train the model
        clf.fit(X_pred, y_pred)

        acc = clf.score(X_test, y_test)
        fraction = clf.n_nonzero(percentage=True)

        accs.append(acc)
        fractions.append(fraction)

    return alpha, np.median(accs), np.median(fractions)
Beispiel #5
0
def fit_model(data):
    """Fit an l1/l2 coordinate-descent classifier on an unpacked task.

    *data* is a 5-tuple ``(X, y, multiclass_flag, alpha, C)``; the
    fitted classifier is returned.
    """
    X, y, multi, alpha, C = data
    params = dict(
        penalty="l1/l2",
        loss="squared_hinge",
        multiclass=multi,
        max_iter=20,
        alpha=alpha,
        C=C,
        tol=1e-3,
    )
    return CDClassifier(**params).fit(X, y)
Beispiel #6
0
def fit_model(data):
    """Train a group-sparse (l1/l2) squared-hinge classifier.

    Expects ``data = (X, y, multi, alpha, C)`` and returns the fitted
    CDClassifier instance.
    """
    features, labels, is_multiclass, reg_alpha, reg_C = data
    classifier = CDClassifier(penalty="l1/l2", loss="squared_hinge",
                              multiclass=is_multiclass, max_iter=20,
                              alpha=reg_alpha, C=reg_C, tol=1e-3)
    classifier.fit(features, labels)
    return classifier
Beispiel #7
0
from sklearn.metrics import f1_score
import scattertext as st

# Load the 20 newsgroups training split, stripping headers/footers/quotes
# so metadata does not leak label information into the features.
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))

# TF-IDF features feeding a multiclass l1/l2 coordinate-descent classifier.
vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   C=1.0 / tfidf_X.shape[0],
                   tol=1e-3)
clf.fit(tfidf_X, newsgroups_train.target)

# Build a scattertext corpus from raw term counts that reuse the
# TF-IDF vocabulary, so coefficient indices line up with corpus terms.
corpus = st.CorpusFromScikit(X=CountVectorizer(
    vocabulary=vectorizer.vocabulary_).fit_transform(newsgroups_train.data),
                             y=newsgroups_train.target,
                             feature_vocabulary=vectorizer.vocabulary_,
                             category_names=newsgroups_train.target_names,
                             raw_texts=newsgroups_train.data).build()

# NOTE(review): this call is truncated in the source — the closing
# parenthesis and any remaining arguments are missing from this chunk.
html = st.produce_frequency_explorer(
    corpus,
    'alt.atheism',
    scores=clf.coef_[0],
    use_term_significance=False,
    terms_to_include=st.AutoTermSelector.get_selected_terms(
        corpus, clf.coef_[0]),
import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Binarize 20 newsgroups: class 0 vs. everything else.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
y[y >= 1] = 1

# Regularization path: 20 log-spaced values of C in [1e-3, 1e3].
Cs = np.logspace(-3, 3, 20)

# Compare total fitting time with and without warm-starting along the path.
for warm_start in (True, False):
    clf = CDClassifier(loss="squared_hinge",
                       tol=1e-3,
                       max_iter=100,
                       warm_start=warm_start)

    scores = []
    start = time.time()
    for C in Cs:
        clf.C = C
        clf.fit(X, y)
        scores.append(clf.score(X, y))

    # Fix: Python 2 print statements converted to the print() function.
    print("Total time", time.time() - start)
    print("Average accuracy", np.mean(scores))
Beispiel #9
0
# Standardize using statistics learned from the TRAINING data only.
scale = preprocessing.StandardScaler().fit(XtrainPos)
XtrainPos = scale.transform(XtrainPos)
# Bug fix: the test set must be transformed with the scaler fitted on
# the training set; calling fit_transform here re-fit the scaler on the
# test data, leaking test statistics into preprocessing.
XtestPos = scale.transform(XtestPos)

#scale = preprocessing.MinMaxScaler()
#XtrainPos = scale.fit_transform(XtrainPos)
#XtestPos = scale.fit_transform(XtestPos)
#
# Normalizer is stateless, but use transform() for consistency.
scale = preprocessing.Normalizer().fit(XtrainPos)
XtrainPos = scale.transform(XtrainPos)
XtestPos = scale.transform(XtestPos)

# classification: multiclass group-sparse (l1/l2) squared-hinge model
clf = CDClassifier(penalty="l1/l2", loss="squared_hinge", multiclass=True,
                   max_iter=20, C=1, alpha=1e-4, tol=1e-3)

#clf = LinearSVC(penalty="l2")
clf = clf.fit(XtrainPos, YtrainPos)
print(metrics.classification_report(YtestPos, clf.predict(XtestPos)))

## Crossvalidation 5 times using different split
#scores = cross_validation.cross_val_score(clf_svm, posfeat, label, cv=5, scoring='f1')
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Visualization
#plt.hist(XtrainPos[:,0])
#plt.show()



        # Feature matrix without the label columns.
        X = tmp.drop(['type', 'rating'], axis=1)

        # split data
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y_type,
                                                            test_size=0.1,
                                                            random_state=42)

        # build model
        # cross validation over the regularization strength
        for C in [1, 0.1, 0.01, 0.001, 0.0001]:
            # C is the inverse of regularization strength; smaller values
            # specify stronger (sparser) l1 regularization.
            clf = LogisticRegression(C=C, penalty='l1', tol=0.001)
            clf.fit(X_train, y_train)

            # Count / fraction of features kept by the l1 penalty.
            num = len(clf.coef_[0].nonzero()[0])
            p = len(clf.coef_[0].nonzero()[0]) * 1.0 / len(X_train.columns)
            # Fix: Python 2 print statements converted to print() calls.
            print('%s = 0, %s = 1' % tuple(clf.classes_))
            print('C: ', C)
            print('Prediction accuracy: ', clf.score(X_test, y_test))
            print('Features left (# / %): ', num, '/', p)

            if C == 1:
                writer.write('%s = 0, %s = 1 \n' % tuple(clf.classes_))
            writer.write('C: %s \n' % C)
            writer.write('Accuracy: %s \n' % clf.score(X_test, y_test))
            writer.write('Features left (#/%%): %s / %s \n' % (num, p))
import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Reduce 20 newsgroups to a binary problem: class 0 vs. the rest.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
y[y >= 1] = 1

Cs = np.logspace(-3, 3, 20)

# Time the full C-path once with warm starts and once cold.
for warm_start in (True, False):
    clf = CDClassifier(loss="squared_hinge", tol=1e-3, max_iter=100,
                       warm_start=warm_start)

    scores = []
    start = time.time()
    for C in Cs:
        clf.C = C
        clf.fit(X, y)
        scores.append(clf.score(X, y))

    # Fix: converted Python 2 print statements to print() calls.
    print("Total time", time.time() - start)
    print("Average accuracy", np.mean(scores))
Beispiel #12
0
# Fix: all Python 2 print statements below converted to print() calls.
print("Acc:", clf.score(X, y))
print(clf.coefs_)

print("### Equivalent Lightning Cython Implementation ###")
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf.max_iter,
                         alpha=1e-4,  # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf.tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print("Acc:", light_clf.score(X, y))
print(light_clf.coef_.T)

import numpy as np
# Load a cached training set; X and Xaug are stored as 0-d object arrays
# holding sparse matrices, hence the .item() unwrapping.
data = np.load('3ng_train.npz')
X = data['X'].item()
Xaug = data['Xaug'].item()
y = data['y']
groups = data['groups']
clf.fit(Xaug, y, groups)
print(clf.score(Xaug, y))

light_clf.verbose = 1
light_clf.fit(X, y)

print(light_clf.score(X, y))
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
import scattertext as st

# 20 newsgroups training split with metadata stripped to avoid label leakage.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# TF-IDF features for a multiclass l1/l2 coordinate-descent classifier.
vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   C=1.0 / tfidf_X.shape[0],
                   tol=1e-3)
clf.fit(tfidf_X, newsgroups_train.target)

# Count-based corpus sharing the TF-IDF vocabulary so that classifier
# coefficient indices correspond to corpus terms.
corpus = st.CorpusFromScikit(
	X=CountVectorizer(vocabulary=vectorizer.vocabulary_).fit_transform(newsgroups_train.data),
	y=newsgroups_train.target,
	feature_vocabulary=vectorizer.vocabulary_,
	category_names=newsgroups_train.target_names,
	raw_texts=newsgroups_train.data
).build()

# NOTE(review): this call is truncated in the source — the closing
# parenthesis and any remaining arguments are missing from this chunk.
html = st.produce_frequency_explorer(
	corpus,
	'alt.atheism',
	scores=clf.coef_[0],
	use_term_significance=False,
	terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, clf.coef_[0]),
Beispiel #14
0
# Join token lists into whitespace-separated documents for the vectorizer.
texts = [" ".join(text) for text in texts]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts)
y_train = labels

clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=15,
                   alpha=1e-4,
                   C=1.0 / X_train.shape[0],
                   tol=1e-6, verbose=5)


# NOTE(review): this rebinding shadows the `mmclf` module with an instance;
# kept as-is since later code relies on the instance.
mmclf = mmclf.LatentGroupClassifier(max_iter=15, C=1.0 / X_train.shape[0])
start = time()
clf.fit(X_train, y_train)
elapsed = time() - start
# Fix: Python 2 print statements converted to print() calls.
print("CDClassifier time", elapsed)
print("CDClassifier score", clf.score(X_train, y_train))
start = time()
mmclf.fit(X_train, y_train)
elapsed = time() - start
print("LatentGroupClassifier time", elapsed)
print("LatentGroupClassifier score", mmclf.score(X_train, y_train))
print("CDClassifier weights\n", clf.coef_.T)
print("LatentGroupClassifier weights\n", mmclf.coefs_.T)
print("features", vectorizer.vocabulary_)