class Solver(BaseSolver):
    """Benchopt solver wrapping lightning's coordinate-descent classifier.

    Solves an L1-penalized logistic regression; the regularization
    strength ``lmbd`` is mapped onto CDClassifier's ``alpha``.
    """

    name = 'Lightning'

    install_cmd = 'conda'
    requirements = [
        'pip:git+https://github.com/scikit-learn-contrib/lightning.git'
    ]

    def set_objective(self, X, y, lmbd):
        # Store the problem data and build the estimator once.
        # tol=0 with permute/shrinking/warm_start disabled makes runs
        # deterministic, so convergence is controlled purely by the
        # max_iter value assigned later in run().
        self.X, self.y, self.lmbd = X, y, lmbd
        self.clf = CDClassifier(loss='log', penalty='l1', C=1,
                                alpha=self.lmbd, tol=0, permute=False,
                                shrinking=False, warm_start=False)

    def run(self, n_iter):
        # Benchopt calls run() with increasing iteration budgets; each call
        # refits from scratch (warm_start=False) for exactly n_iter passes.
        self.clf.max_iter = n_iter
        self.clf.fit(self.X, self.y)

    def get_result(self):
        # Flatten the (1, n_features) coefficient matrix into a vector.
        return self.clf.coef_.flatten()
def Light_lasso(X, y, alpha_):
    """Select features with an L1/L2-penalized squared-hinge model.

    Fits lightning's CDClassifier with regularization strength ``alpha_``
    and keeps only the columns of ``X`` whose coefficients are non-zero.

    Returns the reduced matrix and the selected column indices.
    """
    model = CDClassifier(penalty="l1/l2", loss="squared_hinge",
                         # multiclass=True,
                         max_iter=50, alpha=alpha_,
                         C=1.0 / X.shape[0], tol=1e-3)
    model.fit(X, y)
    # np.nonzero on the (n_classes, n_features) coefficient matrix yields
    # (row, column) index arrays; only the column (feature) indices are
    # used for selection.
    _, selected = np.nonzero(model.coef_)
    return X[:, selected], selected
def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Estimate accuracy and sparsity of a group-lasso motif classifier.

    Builds integer labels from ``sets``, then runs ``k`` bootstrap rounds:
    each round draws ``nsample`` indices with replacement, trains on
    roughly the first 80% and scores on the remainder.

    Returns ``(alpha, median accuracy, median fraction of non-zero
    weights)``.  ``cutoff`` is accepted for interface compatibility but is
    unused here.
    """
    ret = select_sets(df, sets)
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    # Keep only rows assigned to some set, then shift labels to start at 0.
    y = y[y["label"] > 0]
    y -= 1

    clf = CDClassifier(penalty="l1/l2",
                       loss="squared_hinge",
                       multiclass=len(sets) > 2,
                       max_iter=20,
                       alpha=alpha,
                       C=1.0 / motifs.shape[0],
                       tol=1e-3)

    accs = []
    fractions = []
    # BUG FIX: ``nsample * 0.8 + 1`` is a float, and NumPy rejects float
    # slice bounds — compute the integer split point once up front.
    split = int(nsample * 0.8) + 1
    for i in np.arange(k):
        idx = np.random.choice(range(y.shape[0]), nsample, replace=True)
        y_pred = y.iloc[idx[:split]]
        X_pred = motifs.loc[y_pred.index].values
        y_pred = y_pred.values.flatten()
        y_test = y.iloc[idx[split:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()
        # train the model
        clf.fit(X_pred, y_pred)
        acc = clf.score(X_test, y_test)
        fraction = clf.n_nonzero(percentage=True)
        accs.append(acc)
        fractions.append(fraction)
    return alpha, np.median(accs), np.median(fractions)
def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Estimate accuracy and sparsity of a group-lasso motif classifier.

    Builds integer labels from ``sets``, then runs ``k`` bootstrap rounds:
    each round draws ``nsample`` indices with replacement, trains on
    roughly the first 80% and scores on the remainder.

    Returns ``(alpha, median accuracy, median fraction of non-zero
    weights)``.  ``cutoff`` is accepted for interface compatibility but is
    unused here.
    """
    ret = select_sets(df, sets)
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    # Keep only rows assigned to some set, then shift labels to start at 0.
    y = y[y["label"] > 0]
    y -= 1

    clf = CDClassifier(penalty="l1/l2",
                       loss="squared_hinge",
                       multiclass=len(sets) > 2,
                       max_iter=20,
                       alpha=alpha,
                       C=1.0 / motifs.shape[0],
                       tol=1e-3)

    accs = []
    fractions = []
    # BUG FIX: ``nsample * 0.8 + 1`` is a float, and NumPy rejects float
    # slice bounds — compute the integer split point once up front.
    split = int(nsample * 0.8) + 1
    for i in np.arange(k):
        idx = np.random.choice(range(y.shape[0]), nsample, replace=True)
        y_pred = y.iloc[idx[:split]]
        X_pred = motifs.loc[y_pred.index].values
        y_pred = y_pred.values.flatten()
        y_test = y.iloc[idx[split:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()
        # train the model
        clf.fit(X_pred, y_pred)
        acc = clf.score(X_test, y_test)
        fraction = clf.n_nonzero(percentage=True)
        accs.append(acc)
        fractions.append(fraction)
    return alpha, np.median(accs), np.median(fractions)
def fit_model(data):
    """Fit a CDClassifier from a packed parameter tuple.

    ``data`` is ``(X, y, multiclass_flag, alpha, C)``; the single-argument
    signature keeps the function usable with e.g. ``multiprocessing`` map.
    Returns the fitted classifier.
    """
    X, y, multi, alpha, C = data
    settings = dict(penalty="l1/l2",
                    loss="squared_hinge",
                    multiclass=multi,
                    max_iter=20,
                    alpha=alpha,
                    C=C,
                    tol=1e-3)
    classifier = CDClassifier(**settings)
    # CDClassifier.fit returns self, so the trained model is returned.
    return classifier.fit(X, y)
def fit_model(data):
    """Train a CDClassifier on an argument bundle.

    ``data`` unpacks to ``(features, labels, is_multiclass, alpha, C)``,
    making the function map()-friendly.  Returns the fitted estimator.
    """
    features, labels, is_multiclass, alpha, C = data
    clf = CDClassifier(
        penalty="l1/l2",
        loss="squared_hinge",
        multiclass=is_multiclass,
        max_iter=20,
        alpha=alpha,
        C=C,
        tol=1e-3,
    )
    clf.fit(features, labels)
    return clf
from sklearn.metrics import f1_score
import scattertext as st

# Train an L1/L2-penalized multiclass model on TF-IDF features of the
# 20-newsgroups training set, then visualise per-term coefficients with
# scattertext.
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))
vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   C=1.0 / tfidf_X.shape[0],
                   tol=1e-3)
clf.fit(tfidf_X, newsgroups_train.target)
# Re-vectorize with raw counts over the SAME vocabulary so scattertext's
# term statistics line up with the TF-IDF model's feature indices.
corpus = st.CorpusFromScikit(X=CountVectorizer(
    vocabulary=vectorizer.vocabulary_).fit_transform(newsgroups_train.data),
    y=newsgroups_train.target,
    feature_vocabulary=vectorizer.vocabulary_,
    category_names=newsgroups_train.target_names,
    raw_texts=newsgroups_train.data).build()
# NOTE(review): snippet is truncated here — the produce_frequency_explorer
# call is never closed in this chunk; remaining arguments are elsewhere.
html = st.produce_frequency_explorer(
    corpus, 'alt.atheism', scores=clf.coef_[0],
    use_term_significance=False,
    terms_to_include=st.AutoTermSelector.get_selected_terms(
        corpus, clf.coef_[0]),
import time

import numpy as np
from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Benchmark warm starts along a regularization path: with warm_start=True
# each fit resumes from the previous C's solution instead of restarting
# from zero, which should reduce total fitting time.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
# Collapse the 20 newsgroup labels into a binary problem (class 0 vs rest).
y[y >= 1] = 1

Cs = np.logspace(-3, 3, 20)

for warm_start in (True, False):
    clf = CDClassifier(loss="squared_hinge", tol=1e-3, max_iter=100,
                       warm_start=warm_start)
    scores = []
    start = time.time()
    for C in Cs:
        clf.C = C
        clf.fit(X, y)
        # Training-set accuracy only — this script measures speed, not
        # generalization.
        scores.append(clf.score(X, y))
    # BUG FIX: Python 2 print statements converted to print() calls so the
    # script runs under Python 3; output is unchanged.
    print("Total time", time.time() - start)
    print("Average accuracy", np.mean(scores))
# Standardize features using statistics learned from the TRAINING set only.
# BUG FIX: the original called scale.fit_transform() on the test data,
# which re-fits the scaler on the test set (data leakage).  The test set
# must be transformed with the training-set statistics via transform().
scale = preprocessing.StandardScaler().fit(XtrainPos)
XtrainPos = scale.transform(XtrainPos)
XtestPos = scale.transform(XtestPos)

#scale = preprocessing.MinMaxScaler()
#XtrainPos = scale.fit_transform(XtrainPos)
#XtestPos = scale.fit_transform(XtestPos)

# Then scale each sample to unit norm.  Normalizer is stateless (fit is a
# no-op), but transform() is used for symmetry with the scaler above.
scale = preprocessing.Normalizer().fit(XtrainPos)
XtrainPos = scale.transform(XtrainPos)
XtestPos = scale.transform(XtestPos)

# classification
clf = CDClassifier(penalty="l1/l2", loss="squared_hinge", multiclass=True,
                   max_iter=20, C=1, alpha=1e-4, tol=1e-3)
#clf = LinearSVC(penalty="l2")
clf = clf.fit(XtrainPos, YtrainPos)
print(metrics.classification_report(YtestPos, clf.predict(XtestPos)))

## Crossvalidation 5 times using different split
#scores = cross_validation.cross_val_score(clf_svm, posfeat, label, cv=5, scoring='f1')
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Visualization
#plt.hist(XtrainPos[:,0])
#plt.show()
# Feature matrix: everything except the two label columns.
X = tmp.drop(['type', 'rating'], axis=1)

# split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y_type, test_size=0.1, random_state=42)

# build model
# cross validation: sweep a coarse grid of inverse regularization strengths.
for C in [1, 0.1, 0.01, 0.001, 0.0001]:
    # create and fit an L1-penalized logistic regression for each C.
    # C is the inverse of regularization strength; like in support vector
    # machines, smaller values specify stronger regularization (and hence
    # sparser coefficient vectors).
    clf = LogisticRegression(C=C, penalty='l1', tol=0.001)
    clf.fit(X_train, y_train)

    # Number and percentage of features with non-zero coefficients.
    num = len(clf.coef_[0].nonzero()[0])
    p = len(clf.coef_[0].nonzero()[0]) * 1.0 / len(X_train.columns)

    # BUG FIX: Python 2 print statements converted to print() calls so the
    # script runs under Python 3; output is unchanged.
    print('%s = 0, %s = 1' % tuple(clf.classes_))
    print('C: ', C)
    print('Prediction accuracy: ', clf.score(X_test, y_test))
    print('Features left (# / %): ', num, '/', p)

    if C == 1:
        writer.write('%s = 0, %s = 1 \n' % tuple(clf.classes_))
        writer.write('C: %s \n' % C)
        writer.write('Accuracy: %s \n' % clf.score(X_test, y_test))
        writer.write('Features left (#/%%): %s / %s \n' % (num, p))
# BUG FIX throughout this chunk: Python 2 print statements converted to
# print() calls so the script runs under Python 3; output is unchanged.
# NOTE(review): `clf` is defined earlier in the file, outside this chunk.
print("Acc:", clf.score(X, y))
print(clf.coefs_)

print("### Equivalent Lightning Cython Implementation ###")
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf.max_iter,
                         alpha=1e-4,  # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf.tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print("Acc:", light_clf.score(X, y))
print(light_clf.coef_.T)

import numpy as np

# Load a pre-built training set; X/Xaug appear to be stored as 0-d object
# arrays, hence the .item() calls to unwrap them — presumably sparse
# matrices (TODO confirm; newer NumPy also needs allow_pickle=True here).
data = np.load('3ng_train.npz')
X = data['X'].item()
Xaug = data['Xaug'].item()
y = data['y']
groups = data['groups']

clf.fit(Xaug, y, groups)
print(clf.score(Xaug, y))

light_clf.verbose = 1
light_clf.fit(X, y)
print(light_clf.score(X, y))
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
import scattertext as st

# Train an L1/L2-penalized multiclass model on TF-IDF features of the
# 20-newsgroups training set, then visualise per-term coefficients with
# scattertext.
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))
vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   C=1.0 / tfidf_X.shape[0],
                   tol=1e-3)
clf.fit(tfidf_X, newsgroups_train.target)
# Re-vectorize with raw counts over the SAME vocabulary so scattertext's
# term statistics line up with the TF-IDF model's feature indices.
corpus = st.CorpusFromScikit(
    X=CountVectorizer(vocabulary=vectorizer.vocabulary_).fit_transform(newsgroups_train.data),
    y=newsgroups_train.target,
    feature_vocabulary=vectorizer.vocabulary_,
    category_names=newsgroups_train.target_names,
    raw_texts=newsgroups_train.data
).build()
# NOTE(review): snippet is truncated here — the produce_frequency_explorer
# call is never closed in this chunk; remaining arguments are elsewhere.
html = st.produce_frequency_explorer(
    corpus, 'alt.atheism', scores=clf.coef_[0],
    use_term_significance=False,
    terms_to_include=st.AutoTermSelector.get_selected_terms(corpus, clf.coef_[0]),
# Join token lists into whitespace-separated documents for TF-IDF.
texts = [" ".join(text) for text in texts]
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts)
y_train = labels

clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=15,
                   alpha=1e-4,
                   C=1.0 / X_train.shape[0],
                   tol=1e-6,
                   verbose=5)
# NOTE(review): this rebinds the name `mmclf` (presumably a module) to the
# classifier instance, shadowing it for the rest of the script — works
# here, but worth renaming upstream.
mmclf = mmclf.LatentGroupClassifier(max_iter=15, C=1.0 / X_train.shape[0])

# BUG FIX throughout: Python 2 print statements converted to print() calls
# so the script runs under Python 3; output is unchanged.
start = time()
clf.fit(X_train, y_train)
elapsed = time() - start
print("CDClassifier time", elapsed)
print("CDClassifier score", clf.score(X_train, y_train))

start = time()
mmclf.fit(X_train, y_train)
elapsed = time() - start
print("LatentGroupClassifier time", elapsed)
print("LatentGroupClassifier score", mmclf.score(X_train, y_train))

print("CDClassifier weights\n", clf.coef_.T)
print("LatentGroupClassifier weights\n", mmclf.coefs_.T)
print("features", vectorizer.vocabulary_)