Example 1
from benchopt import BaseSolver
from lightning.classification import CDClassifier


class Solver(BaseSolver):
    name = 'Lightning'

    install_cmd = 'conda'
    requirements = [
        'pip:git+https://github.com/scikit-learn-contrib/lightning.git'
    ]

    def set_objective(self, X, y, lmbd):

        self.X, self.y, self.lmbd = X, y, lmbd

        self.clf = CDClassifier(loss='log',
                                penalty='l1',
                                C=1,
                                alpha=self.lmbd,
                                tol=0,
                                permute=False,
                                shrinking=False,
                                warm_start=False)

    def run(self, n_iter):
        self.clf.max_iter = n_iter
        self.clf.fit(self.X, self.y)

    def get_result(self):
        return self.clf.coef_.flatten()
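For reference, the solver above reduces to a direct lightning call; this is a minimal standalone sketch with synthetic data (the random X, y and the lmbd value are assumptions, not part of the benchmark):

import numpy as np
from lightning.classification import CDClassifier

rng = np.random.RandomState(0)
X = rng.randn(100, 20)
y = np.sign(rng.randn(100)).astype(int)
lmbd = 0.1  # assumed regularization strength

# same configuration as set_objective/run above
clf = CDClassifier(loss='log', penalty='l1', C=1, alpha=lmbd,
                   tol=0, permute=False, shrinking=False, warm_start=False)
clf.max_iter = 100
clf.fit(X, y)
print(clf.coef_.flatten()[:5])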
Example 2
from lightning.classification import CDClassifier


def createLightningClassification(params=None):
    ## Params
    lParams = CDClassifier().get_params()
    if params is None:
        params = lParams

    C           = getParams('C', float, None, params, lParams)
    Cd          = getParams('Cd', float, None, params, lParams)
    alpha       = getParams('alpha', float, None, params, lParams)
    beta        = getParams('beta', float, None, params, lParams)
    loss        = getParams('loss', str, ['squared_hinge'], params, lParams)
    max_iter    = getParams('max_iter', int, None, params, lParams)
    max_steps   = getParams('max_steps', str, ['auto'], params, lParams)
    n_calls     = getParams('n_calls', int, None, params, lParams)
    n_jobs      = getParams('n_jobs', int, None, params, lParams)
    penalty     = getParams('penalty', str, ['l1', 'l2', 'l1/l2'], params, lParams)
    sigma       = getParams('sigma', float, None, params, lParams)
    termination = getParams('termination', str, ['violation_max', 'violation_sum'], params, lParams)
    tol         = getParams('tol', float, None, params, lParams)

    ## Estimator
    clf = CDClassifier(C=C, Cd=Cd, alpha=alpha, beta=beta,
                       loss=loss, max_iter=max_iter,
                       max_steps=max_steps, n_calls=n_calls,
                       n_jobs=n_jobs, penalty=penalty,
                       sigma=sigma, termination=termination, tol=tol)
    
    return clf
 
#
## Load News20 dataset from scikit-learn.
#bunch = fetch_20newsgroups_vectorized(subset="all")
#X = bunch.data
#y = bunch.target
#
## Set classifier options.
#clf = CDClassifier(penalty="l1/l2",
#                   loss="squared_hinge",
#                   multiclass=True,
#                   max_iter=20,
#                   alpha=1e-4,
#                   C=1.0 / X.shape[0],
#                   tol=1e-3)
#
## Train the model.
#clf.fit(X, y)
#
## Accuracy
#print(clf.score(X, y))
#
## Percentage of selected features
#print(clf.n_nonzero(percentage=True))
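A hypothetical call to the factory function above (it assumes the author's getParams helper is defined); passing no params falls back to CDClassifier's defaults, while a partial dict overrides only the listed keys:

clf = createLightningClassification()  # all CDClassifier defaults
clf = createLightningClassification({'penalty': 'l1', 'alpha': 1e-4, 'max_iter': 50})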
Example 4
import numpy as np
from lightning.classification import CDClassifier


def Light_lasso(X, y, alpha_):
    clf = CDClassifier(
        penalty="l1/l2",
        loss="squared_hinge",
        #multiclass=True,
        max_iter=50,
        alpha=alpha_,
        C=1.0 / X.shape[0],
        tol=1e-3)
    clf.fit(X, y)
    # keep only the features (columns) with a non-zero coefficient
    _, H2 = np.nonzero(clf.coef_)
    X = X[:, H2]
    return X, H2
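A minimal sketch of how Light_lasso might be called; the synthetic data and the alpha value are assumptions:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 50)
y = (rng.rand(200) > 0.5).astype(int)

X_selected, kept_cols = Light_lasso(X, y, alpha_=1e-3)
print(X_selected.shape, kept_cols)  # reduced matrix and the retained column indices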
Example 5
from lightning.classification import CDClassifier


def fit_model(data):
    X, y, multi, alpha, C = data
    # print("fitting {} {}".format(X.shape, y.shape))
    # Set classifier options.
    clf = CDClassifier(penalty="l1/l2",
                       loss="squared_hinge",
                       multiclass=multi,
                       max_iter=20,
                       alpha=alpha,
                       C=C,
                       tol=1e-3)

    # Train the model.
    return clf.fit(X, y)
Example 7
    def __init__(self, scale=True, permute=False, ncpus=None):
        """Predict motif activities using lightning CDClassifier 

        Parameters
        ----------
        scale : boolean, optional, default True
            If ``True``, the motif scores will be scaled 
            before classification
        
        ncpus : int, optional
            Number of threads. Default is the number specified in the config.
       
        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            fitted coefficients

        sig_ : DataFrame, shape (n_motifs,)
            boolean values, if coefficients are higher/lower than
            the 1%t from random permutation
        """

        self.act_description = ("activity values: coefficients from "
                                "fitted model")

        #self.cdc = CDClassifier(random_state=args.seed)
        self.cdc = CDClassifier()

        self.parameters = {
            "penalty": ["l1/l2"],
            "loss": ["squared_hinge"],
            "multiclass": [True],
            "max_iter": [20],
            "alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 3.0)],
            "C": [0.001, 0.01, 0.1, 0.5, 1.0],
            "tol": [1e-3]
        }

        self.kfolds = 10

        if ncpus is None:
            ncpus = int(MotifConfig().get_default_params().get("ncpus", 2))

        self.clf = GridSearchCV(self.cdc,
                                self.parameters,
                                cv=self.kfolds,
                                n_jobs=ncpus)

        self.scale = scale
        self.permute = permute

        self.act_ = None
        self.sig_ = None
        self.pref_table = "score"
        self.supported_tables = ["score", "count"]
        self.ptype = "classification"
Example 8
def eval_model(df, sets, motifs, alpha, nsample=1000, k=10, cutoff=0):
    ret = select_sets(df, sets)
    y = pd.DataFrame({"label":0}, index=df.index)
    for label, rows in enumerate(ret):
        y.loc[rows] = label + 1
    y = y[y["label"] > 0]
    y -= 1

    clf = CDClassifier(penalty="l1/l2",
                       loss="squared_hinge",
                       multiclass=len(sets) > 2,
                       max_iter=20,
                       alpha=alpha,
                       C=1.0 / motifs.shape[0],
                       tol=1e-3)

    accs = []
    fractions = []

    for i in np.arange(k):

        idx = np.random.choice(range(y.shape[0]), nsample, replace=True)

        y_pred = y.iloc[idx[:int(nsample * 0.8) + 1]]
        X_pred = motifs.loc[y_pred.index].values
        y_pred = y_pred.values.flatten()

        y_test = y.iloc[idx[int(nsample * 0.8) + 1:]]
        X_test = motifs.loc[y_test.index].values
        y_test = y_test.values.flatten()

        # train the model
        clf.fit(X_pred, y_pred)

        acc = clf.score(X_test, y_test)
        fraction = clf.n_nonzero(percentage=True)

        accs.append(acc)
        fractions.append(fraction)

    #print alpha, accs, fractions
    return alpha, np.median(accs), np.median(fractions)
Example 9
    def __init__(self, scale=True):
        """Predict motif activities using lighting CDClassifier

        Parameters
        ----------
        scale : boolean, optional, default True
            If ``True``, the motif scores will be scaled 
            before classification
       
        Attributes
        ----------
        act_ : DataFrame, shape (n_motifs, n_clusters)
            fitted coefficients

        sig_ : DataFrame, shape (n_motifs,)
            boolean values, if coefficients are higher/lower than
            the 1%t from random permutation
        """

        self.act_description = ("activity values: coefficients from "
                                "fitted model")

        #self.cdc = CDClassifier(random_state=args.seed)
        self.cdc = CDClassifier()

        self.parameters = {
            "penalty": ["l1/l2"],
            "loss": ["squared_hinge"],
            "multiclass": [True],
            "max_iter": [20],
            "alpha": [np.exp(-x) for x in np.arange(0, 10, 1 / 3.0)],
            "C": [0.001, 0.01, 0.1, 0.5, 1.0],
            "tol": [1e-3]
        }

        self.kfolds = 10

        self.clf = GridSearchCV(self.cdc,
                                self.parameters,
                                cv=self.kfolds,
                                n_jobs=-1)

        self.scale = scale

        self.act_ = None
        self.sig_ = None
Example 11
import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
y[y >= 1] = 1

Cs = np.logspace(-3, 3, 20)

for warm_start in (True, False):
    clf = CDClassifier(loss="squared_hinge", tol=1e-3, max_iter=100,
                       warm_start=warm_start)

    scores = []
    start = time.time()
    for C in Cs:
        clf.C = C
        clf.fit(X, y)
        scores.append(clf.score(X, y))

    print "Total time", time.time() - start
    print "Average accuracy", np.mean(scores)
Example 12
one_over_n = 1. / float(n_samples)
ds = ColumnData(X)
coefs_ = np.zeros((n_features, n_classes))

fit(ds, y, one_over_n, n_samples, n_features, n_classes, coefs_, groups)
s = score(X, y, coefs_)
print("score =", s)

print('======================================================')

clf_max_iter = 300
clf_tol = 1e-3
print("### Equivalent Lightning Cython Implementation ###")
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf_max_iter,
                         alpha=0.5,  # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf_tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print("Acc:", light_clf.score(X, y))
print(light_clf.coef_.T)
Example 13
from lightning.classification import CDClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
import scattertext as st

newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))

vectorizer = TfidfVectorizer()
tfidf_X = vectorizer.fit_transform(newsgroups_train.data)
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   C=1.0 / tfidf_X.shape[0],
                   tol=1e-3)
clf.fit(tfidf_X, newsgroups_train.target)

corpus = st.CorpusFromScikit(X=CountVectorizer(
    vocabulary=vectorizer.vocabulary_).fit_transform(newsgroups_train.data),
                             y=newsgroups_train.target,
                             feature_vocabulary=vectorizer.vocabulary_,
                             category_names=newsgroups_train.target_names,
                             raw_texts=newsgroups_train.data).build()

html = st.produce_frequency_explorer(
    corpus,
    'alt.atheism',
    scores=clf.coef_[0],
Example 14
import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from lightning.classification import CDClassifier

bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target

# Select a subset of the classes for faster training.
ind = np.arange(X.shape[0])
subset = y < 5
X = X[ind[subset]]
y = y[subset]

# Train / test split.
X_tr, X_te, y_tr, y_te = train_test_split(X,
                                          y,
                                          train_size=0.75,
                                          test_size=0.25,
                                          random_state=0)

clfs = (CDClassifier(loss="squared_hinge",
                     penalty="l2",
                     max_iter=20,
                     random_state=0), LinearSVC(max_iter=20, random_state=0),
        SGDClassifier(learning_rate="constant",
                      alpha=1e-3,
                      max_iter=20,
                      random_state=0))

for clf in clfs:
    print(clf.__class__.__name__)
    clf.fit(X_tr, y_tr)
    print(clf.score(X_te, y_te))
Example 15
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from lightning.classification import CDClassifier
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_set_y = np.asarray(le.fit_transform(train_set_y), dtype='int32')
valid_set_y = np.asarray(le.transform(valid_set_y), dtype='int32')
test_set_y = np.asarray(le.transform(test_set_y), dtype='int32')

clfs = [  #SGDClassifier(loss='hinge', penalty='l2'),
    "LogisticRegression",
    CDClassifier(penalty="l1/l2",
                 loss="squared_hinge",
                 multiclass=True,
                 max_iter=20,
                 alpha=1e-4,
                 C=1.0 / train_set_x.shape[0],
                 tol=1e-3),
    CDClassifier(penalty="l1/l2",
                 loss="log",
                 multiclass=True,
                 max_iter=20,
                 alpha=1e-4,
                 C=1.0 / train_set_x.shape[0],
                 tol=1e-3),
    #svm.LinearSVC(),
    svm.SVC(kernel='rbf', cache_size=8000, max_iter=20)
]
for clf in clfs:
    print(clf)
Example 17
# normalization of features
scale = preprocessing.StandardScaler().fit(XtrainPos)
XtrainPos = scale.transform(XtrainPos)
XtestPos = scale.transform(XtestPos)  # transform (not fit_transform) on test data

#scale = preprocessing.MinMaxScaler()
#XtrainPos = scale.fit_transform(XtrainPos)
#XtestPos = scale.fit_transform(XtestPos)
#
scale = preprocessing.Normalizer().fit(XtrainPos)
XtrainPos = scale.transform(XtrainPos)
XtestPos = scale.transform(XtestPos)

# classification
clf = CDClassifier(penalty="l1/l2", loss="squared_hinge", multiclass=True,
                   max_iter=20, C=1, alpha=1e-4, tol=1e-3)

#clf = LinearSVC(penalty="l2")
clf = clf.fit(XtrainPos, YtrainPos)
print(metrics.classification_report(YtestPos, clf.predict(XtestPos)))

## Crossvalidation 5 times using different split
#scores = cross_validation.cross_val_score(clf_svm, posfeat, label, cv=5, scoring='f1')
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Visualization
#plt.hist(XtrainPos[:,0])
#plt.show()
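A sklearn Pipeline avoids the fit-on-test pitfall entirely, since the transformers are fit only when the pipeline itself is fit; a sketch reusing the variables from the snippet above:

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(preprocessing.StandardScaler(),
                     preprocessing.Normalizer(),
                     CDClassifier(penalty="l1/l2", loss="squared_hinge",
                                  multiclass=True, max_iter=20, C=1,
                                  alpha=1e-4, tol=1e-3))
pipe.fit(XtrainPos, YtrainPos)   # scalers are fit on training data only
pred = pipe.predict(XtestPos)    # test data is only transformed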
Example 18
clf1 = SVRGClassifier(loss="squared_hinge",
                      alpha=alpha,
                      eta=eta_svrg,
                      n_inner=1.0,
                      max_iter=100,
                      random_state=0,
                      tol=1e-24)
clf2 = SDCAClassifier(loss="squared_hinge",
                      alpha=alpha,
                      max_iter=100,
                      n_calls=X.shape[0] // 2,
                      random_state=0,
                      tol=tol)
clf3 = CDClassifier(loss="squared_hinge",
                    alpha=alpha,
                    C=1.0 / X.shape[0],
                    max_iter=50,
                    n_calls=X.shape[1] // 3,
                    random_state=0,
                    tol=tol)
clf4 = AdaGradClassifier(loss="squared_hinge",
                         alpha=alpha,
                         eta=eta_adagrad,
                         n_iter=100,
                         n_calls=X.shape[0] // 2,
                         random_state=0)
clf5 = SAGAClassifier(loss="squared_hinge",
                      alpha=alpha,
                      max_iter=100,
                      random_state=0,
                      tol=tol)
clf6 = SAGClassifier(loss="squared_hinge",
                     alpha=alpha,
                     max_iter=100,
                     random_state=0,
                     tol=tol)
Example 19
import time

import numpy as np

from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Load News20 dataset from scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target
y[y >= 1] = 1

for shrinking in (True, False):
    clf = CDClassifier(C=1.0, loss="squared_hinge", penalty="l1", tol=1e-3,
                       max_iter=1000, shrinking=shrinking, random_state=0)
    start = time.time()
    clf.fit(X, y)
    print "Training time", time.time() - start
    print "Accuracy", clf.score(X, y)
Example 20
top_words = 30

print("==== Keywords ====")
for m in range(clf.coefs_.shape[1]):
    t = []
    print('Topic', m)
    for row in range(clf.coefs_.shape[0]):
        if clf.coefs_[row, m] > 0:
            t.append(dict_text[row])
    for k in heapq.nlargest(top_words, t):
        print(k, end=' ')
    print()

print "==== Lightning Cython Implementation (Row-wise sparsity) ====="
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf.max_iter,
                         alpha=1e-4, # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf.tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print "==========>> Accuracy :", light_clf.score(X, y)
print "Weight Matrix:"
print (light_clf.coef_.T)


Example 21
# remove words that appear only once
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once]
         for text in texts]

texts = [" ".join(text) for text in texts]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts)
y_train = labels

clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=15,
                   alpha=1e-4,
                   C=1.0 / X_train.shape[0],
                   tol=1e-6, verbose=5)


mm_clf = mmclf.LatentGroupClassifier(max_iter=15, C=1.0 / X_train.shape[0])
start = time()
clf.fit(X_train, y_train)
elapsed = time() - start
print("CDClassifier time", elapsed)
print("CDClassifier score", clf.score(X_train, y_train))
start = time()
mm_clf.fit(X_train, y_train)
elapsed = time() - start
print("LatentGroupClassifier time", elapsed)
Example 22
#Source
#http://contrib.scikit-learn.org/lightning/

from sklearn.datasets import fetch_20newsgroups_vectorized
from lightning.classification import CDClassifier

# Load News20 dataset from scikit-learn.
bunch = fetch_20newsgroups_vectorized(subset="all")
X = bunch.data
y = bunch.target

# Set classifier options.
clf = CDClassifier(penalty="l1/l2",
                   loss="squared_hinge",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   C=1.0 / X.shape[0],
                   tol=1e-3)

# Train the model.
clf.fit(X, y)

# Accuracy
print(clf.score(X, y))

# Percentage of selected features
print(clf.n_nonzero(percentage=True))
Example 24

print "### BASELINE GROUP LASSO in pure python/numpy###"
X = X_train
y = y_train
clf = ogroup.BaselineGroupLasso(max_iter=30, alpha=.5, max_steps=30)
clf.fit(X, y, groups)
print "Acc:", clf.score(X, y)
print clf.coefs_

print "### Equivalent Lightning Cython Implementation ###"
light_clf = CDClassifier(penalty="l1/l2",
                         loss="squared_hinge",
                         multiclass=True,
                         max_iter=clf.max_iter,
                         alpha=1e-4, # clf.alpha,
                         C=1.0 / X.shape[0],
                         tol=clf.tol,
                         permute=False,
                         verbose=3,
                         random_state=0).fit(X, y)
print "Acc:", light_clf.score(X, y)
print light_clf.coef_.T

import numpy as np
data = np.load('3ng_train.npz')
X = data['X'].item()
Xaug = data['Xaug'].item()
y = data['y']
groups = data['groups']
clf.fit(Xaug, y, groups)
print clf.score(Xaug, y)
Example 25
def main():
    desc = '''
Learns a multi-class classification model that discriminates across clusters.

The path(s) to the feature matrices are read from STDIN. Each path should point
to an npz file containing the feature matrix for a different cluster. Each cluster
will be considered as a separate class. A multi-class classification model will
be trained on a fraction of the data (controlled by the --train parameter). The
rest of the data will be split into a test and a validation set of equal sizes.'''
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('outfile')
    parser.add_argument('--alpha', type=float, default=0.01,
                        help='Coefficient of the penalty term.')
    parser.add_argument('--tol', type=float, default=0.01,
                        help='Tolerance for the termination criterion.')
    parser.add_argument('--train', type=float, default=0.8,
                        help='Fraction of examples used for training. [%(default)s]')
    parser.add_argument('--maxfreq', type=float, default=0.3,
                        help='Maximum frequency for a feature to be considered. [%(default)s]')
    parser.add_argument('--log', action='store_true', default=False,
                        help='Use logistic regression')
    args = parser.parse_args()
    outfile = args.outfile
    alpha = args.alpha
    assert(alpha >= 0)
    train_prc = args.train
    assert(train_prc > 0 and train_prc < 1)
    max_freq = args.maxfreq
    assert(max_freq >= 0 and max_freq <= 1)
        
    files = []
    for filename in fileinput.input([]):
        files.append(filename.strip())
        
    (scores, rule_names) = merge_scores(files, vertical = True)
    
    y = np.repeat(np.arange(len(files)), scores.shape[0] // len(files))

    if args.log:
        model = LogisticRegression(penalty='l1', C=alpha,
                                   tol=args.tol, random_state=1)
    else:
        model = CDClassifier(penalty='l1/l2', loss='squared_hinge', multiclass=True,
                             max_iter=100, alpha=alpha, C=1.0 / y.size,
                             shrinking=False,  # weird behavior if this is set to True
                             tol=args.tol, random_state=1, verbose=2)

    np.random.seed(1)
    perm = np.random.permutation(len(y))
    y = y[perm]
    scores = scores[perm, :]

    hits = np.sum(scores, axis=0) / float(scores.shape[0])
    print('Max frequency', np.max(hits), file=sys.stderr)
    print('Num features with freq >', max_freq, np.sum(hits > max_freq), file=sys.stderr)

    sel_feat = np.argwhere(hits <= max_freq).flatten()
    rule_names = list(np.array(rule_names)[sel_feat])
    scores = scores[:, sel_feat]
    print('Scores shape', scores.shape, file=sys.stderr)

    # Get balanced training, test, and validation sets.
    cv = StratifiedShuffleSplit(n_splits=1, test_size=1.0 - train_prc)
    for train, test in cv.split(scores, y):
        train_idx = train
        test_tmp = test
    # Now split the test set (which is balanced by design) into two balanced parts.
    cv = StratifiedShuffleSplit(n_splits=1, test_size=0.5)
    for train, test in cv.split(scores[test_tmp, :], y[test_tmp]):
        test_idx = test_tmp[train]
        val_idx = test_tmp[test]
    
    assert(len(set(train_idx).intersection(set(test_idx))) == 0)
    assert(len(set(val_idx).intersection(set(test_idx))) == 0)
    assert(len(set(train_idx).intersection(set(val_idx))) == 0)
    print('Will use', len(train_idx), 'examples for training,',
          len(test_idx), 'for testing, and', len(val_idx), 'for validation',
          file=sys.stderr)
    all_idx = [train_idx, val_idx, test_idx]
    #assert(np.sum([len(i) for i in all_idx]) == y.size)

    model.fit(sp.csr_matrix(scores[train_idx, :], dtype=np.float64), y[train_idx])
    acc = []
    confusion = []
    for idx in all_idx:
        pred = model.predict(sp.csr_matrix(scores[idx, :], dtype=np.float64))
        acc.append(accuracy_score(y[idx], pred))
        confusion.append(confusion_matrix(y[idx], pred))
    
    with open(args.outfile, 'wb') as outfile:
        pickle.dump(model.coef_, outfile)
        pickle.dump(rule_names, outfile)
        pickle.dump(acc, outfile)
        pickle.dump(confusion, outfile)
Example 26
# split data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y_type,
                                                    test_size=0.1,
                                                    random_state=42)

# build model
# prepare parameters
params = dict(alpha=[0.001], C=[0.0001])

# create a CDClassifier and grid-search over the parameter grid above
clf = CDClassifier(penalty="l1/l2",
                   loss="log",
                   multiclass=True,
                   max_iter=20,
                   alpha=1e-4,
                   verbose=1,
                   C=1.0,
                   tol=1e-3)

grid = GridSearchCV(estimator=clf, param_grid=params)
grid.fit(X_train, y_train)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_)

bst = grid.best_estimator_
print(bst.predict(X_test))
print(bst.score(X_test, y_test))