def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Estimate accuracy and sparsity of a CDClassifier for one ``alpha``.

    Rows of ``df`` are assigned a class label according to which entry of
    ``select_sets(df, sets)`` they belong to; rows that belong to none are
    dropped.  The classifier is then fitted and scored ``k`` times, each
    time on a fresh random sample of the labelled rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index identifies the rows to classify.
    sets
        Passed to ``select_sets``; ``len(sets) > 2`` switches the
        classifier to multiclass mode.
    motifs : pandas.DataFrame
        Feature table, looked up by the same index labels as ``df``.
    alpha : float
        Regularisation strength handed to ``CDClassifier``.
    nsample : int, optional
        Number of rows drawn (with replacement) per repetition.
    k : int, optional
        Number of repetitions.
    cutoff : optional
        Unused by this function; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(alpha, median_accuracy, median_fraction)`` where the last two
        are the medians over the ``k`` repetitions of ``clf.score`` and
        ``clf.n_nonzero(percentage=True)``.

    Raises
    ------
    ValueError
        If no row of ``df`` is selected by any set.
    """
    selections = select_sets(df, sets)

    # 0 means "not in any set"; set i is stored as i + 1 so that unlabelled
    # rows can be filtered out before shifting back to 0-based labels.
    # If selections overlap, the later set wins.
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(selections):
        y.loc[rows] = label + 1
    y = y[y["label"] > 0]
    y -= 1

    n_rows = y.shape[0]
    if n_rows == 0:
        raise ValueError("no rows of df were selected by any set")

    clf = CDClassifier(
        penalty="l1/l2",
        loss="squared_hinge",
        multiclass=len(sets) > 2,
        max_iter=20,
        alpha=alpha,
        C=1.0 / motifs.shape[0],
        tol=1e-3,
    )

    # Slice bounds must be integers: a float bound raises TypeError on
    # Python 3 / current NumPy.  int() truncates, matching what older
    # NumPy versions did with the float value.
    split = int(nsample * 0.8) + 1

    accs = []
    fractions = []
    for _ in range(k):
        # NOTE(review): sampling is with replacement and the split is taken
        # from the same draw, so a row can appear in both the training and
        # the test part -- confirm this is intended.
        idx = np.random.choice(n_rows, nsample, replace=True)

        y_train = y.iloc[idx[:split]]
        X_train = motifs.loc[y_train.index].values
        y_train = y_train.values.flatten()

        y_test = y.iloc[idx[split:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        # Train on the first part of the sample, evaluate on the rest.
        clf.fit(X_train, y_train)
        accs.append(clf.score(X_test, y_test))
        fractions.append(clf.n_nonzero(percentage=True))

    return alpha, np.median(accs), np.median(fractions)
def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    """Estimate accuracy and sparsity of a CDClassifier for one ``alpha``.

    Rows of ``df`` are assigned a class label according to which entry of
    ``select_sets(df, sets)`` they belong to; rows that belong to none are
    dropped.  The classifier is then fitted and scored ``k`` times, each
    time on a fresh random sample of the labelled rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index identifies the rows to classify.
    sets
        Passed to ``select_sets``; ``len(sets) > 2`` switches the
        classifier to multiclass mode.
    motifs : pandas.DataFrame
        Feature table, looked up by the same index labels as ``df``.
    alpha : float
        Regularisation strength handed to ``CDClassifier``.
    nsample : int, optional
        Number of rows drawn (with replacement) per repetition.
    k : int, optional
        Number of repetitions.
    cutoff : optional
        Unused by this function; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(alpha, median_accuracy, median_fraction)`` where the last two
        are the medians over the ``k`` repetitions of ``clf.score`` and
        ``clf.n_nonzero(percentage=True)``.

    Raises
    ------
    ValueError
        If no row of ``df`` is selected by any set.
    """
    selections = select_sets(df, sets)

    # 0 means "not in any set"; set i is stored as i + 1 so that unlabelled
    # rows can be filtered out before shifting back to 0-based labels.
    # If selections overlap, the later set wins.
    y = pd.DataFrame({"label": 0}, index=df.index)
    for label, rows in enumerate(selections):
        y.loc[rows] = label + 1
    y = y[y["label"] > 0]
    y -= 1

    n_rows = y.shape[0]
    if n_rows == 0:
        raise ValueError("no rows of df were selected by any set")

    clf = CDClassifier(
        penalty="l1/l2",
        loss="squared_hinge",
        multiclass=len(sets) > 2,
        max_iter=20,
        alpha=alpha,
        C=1.0 / motifs.shape[0],
        tol=1e-3,
    )

    # Slice bounds must be integers: a float bound raises TypeError on
    # Python 3 / current NumPy.  int() truncates, matching what older
    # NumPy versions did with the float value.
    split = int(nsample * 0.8) + 1

    accs = []
    fractions = []
    for _ in range(k):
        # NOTE(review): sampling is with replacement and the split is taken
        # from the same draw, so a row can appear in both the training and
        # the test part -- confirm this is intended.
        idx = np.random.choice(n_rows, nsample, replace=True)

        y_train = y.iloc[idx[:split]]
        X_train = motifs.loc[y_train.index].values
        y_train = y_train.values.flatten()

        y_test = y.iloc[idx[split:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        # Train on the first part of the sample, evaluate on the rest.
        clf.fit(X_train, y_train)
        accs.append(clf.score(X_test, y_test))
        fractions.append(clf.n_nonzero(percentage=True))

    return alpha, np.median(accs), np.median(fractions)
# Source: http://contrib.scikit-learn.org/lightning/
from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Fetch the vectorized News20 data (all subsets) through scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X, y = bunch.data, bunch.target

# Classifier configuration: group-lasso style penalty with squared hinge
# loss; C is scaled by the number of rows in X.
clf = CDClassifier(
    penalty="l1/l2",
    loss="squared_hinge",
    multiclass=True,
    max_iter=20,
    alpha=1e-4,
    C=1.0 / X.shape[0],
    tol=1e-3,
)

# Fit on the full dataset.
clf.fit(X, y)

# Accuracy, measured on the same data the model was fitted on.
print(clf.score(X, y))

# Percentage of selected features.
print(clf.n_nonzero(percentage=True))