Beispiel #1
0
def classifier_train(train_features,
                     train_labels,
                     test_features,
                     svm_eps = 1e-5,
                     svm_C = 10**4,
                     classifier_type = "liblinear"
                     ):
    """ Classifier training using SVMs or logistic regression.

    Input:
    train_features = training features (both positive and negative)
    train_labels = corresponding label vector
    test_features = test features (sphered together with the training set)
    svm_eps = eps (stopping tolerance) of svm
    svm_C = C parameter of svm (also used as C for logistic regression)
    classifier_type = 'liblinear', 'libSVM', 'LRL1' or 'LRL2'

    Returns the fitted classifier."""

    #sphering (normalizes features using training-set statistics)
    train_features, test_features = __sphere(train_features, test_features)

    # Single if/elif chain so exactly one estimator is constructed.
    if classifier_type == 'liblinear':
        clf = svm.LinearSVC(eps = svm_eps, C = svm_C)
    elif classifier_type == 'libSVM':
        clf = svm.SVC(eps = svm_eps, C = svm_C, probability = True)
    elif classifier_type == 'LRL1':
        clf = LogisticRegression(C=svm_C, penalty = 'l1')
    elif classifier_type == 'LRL2':
        # BUG FIX: this branch previously used penalty='l1', which made
        # 'LRL2' behave identically to 'LRL1'; L2 regularization needs 'l2'.
        clf = LogisticRegression(C=svm_C, penalty = 'l2')
    else:
        # Previously an unknown type fell through to an unbound 'clf'
        # (NameError); fail with a clear message instead.
        raise ValueError("unknown classifier_type: %r" % (classifier_type,))

    clf.fit(train_features, train_labels)

    return clf
Beispiel #2
0
def makeLogisticRegression(inData, regularization, penalty):
    print "Logistic regression parameters"
    print "Regularization  = ", regularization
    print "Penalty (L1/L2) = ", penalty
    print "Starting logistic regression calculation"
    sys.stdout.flush()
    xData, yData = getXYData(inData)
    logRegClassifier = LogisticRegression(C=regularization, penalty=penalty)
    logRegClassifier.fit(xData, yData)
    print "Finished logistic regression calculation"
    return logRegClassifier
Beispiel #3
0
def test_dense_tf_idf():
    """Train a logistic regression on dense TF-IDF features and check
    that the two held-out documents (first and last) are classified
    with the expected sign."""
    vectorizer = HashingVectorizer(dim=1000, probes=3)
    for corpus in (JUNK_FOOD_DOCS, NOTJUNK_FOOD_DOCS):
        vectorizer.vectorize(corpus)

    # extract the TF-IDF data
    tfidf = vectorizer.get_tfidf()
    assert_equal(tfidf.shape, (11, 1000))

    # label junk food as -1, the others as +1
    labels = np.ones(tfidf.shape[0])
    labels[:6] = -1

    # fit on the interior documents, predict the two held-out ends
    model = LogisticRegression().fit(tfidf[1:-1], labels[1:-1])
    assert_equal(model.predict([tfidf[0]]), [-1])
    assert_equal(model.predict([tfidf[-1]]), [1])
Beispiel #4
0
def AUCkFoldLogisticRegression(regularization, inData, penalty, kFolds):
    """Evaluate logistic regression via k-fold cross validation; return
    the ROC AUC computed over the pooled test predictions of all folds.

    regularization = C parameter of LogisticRegression
    inData = raw data; split into features/labels by getXYData
    penalty = 'l1' or 'l2'
    kFolds = number of folds; must divide the sample count evenly,
             otherwise UserWarning is raised

    Side effects: prints per-fold AUC / non-zero-coefficient counts and a
    final all-folds summary to stdout.
    """
    print "\n\tCalculating AUC for regularization", regularization, "using", kFolds, "folds"
    sys.stdout.flush()
    xData, yData = getXYData(inData)
    nSamples, nFeatures = xData.shape
    # Require equal-sized folds so every sample is tested exactly once.
    if nSamples % kFolds != 0:
        raise UserWarning(
            "Uneven fold sizes! Must evenly divide 5922 (e.g. 2,3,7 or 9 folds"
        )
        # 2, 3, 7, and 9 are factors of 5922 (#data points) & yield equal fold sizes
    crossValFolds = KFold(nSamples, kFolds)
    # Accumulators for the pooled (all-folds) ROC/AUC computation below.
    yTestDataAllFolds = array([])
    probasTestDataAllFolds = array([])
    sumAUC = 0.0
    for foldNum, (train, test) in enumerate(crossValFolds):
        # fit a new LR model for each fold's data & evaluate using AUC
        LRclassifier = LogisticRegression(C=regularization, penalty=penalty)
        probas_ = LRclassifier.fit(xData[train],
                                   yData[train]).predict_proba(xData[test])
        # Count surviving (non-zero) coefficients -- informative under L1,
        # which drives coefficients exactly to zero.
        numNon0Coefs = sum(
            [1 for coef in LRclassifier.coef_[:][0] if coef != 0])
        # probas_ contains 2 columns of probabilities, one for each of the 2 classes (0,1)
        # In the documentation, seems like col 1 is for class 1,
        # but tests show it seems like col 0 is for class 1, so we use that below.
        CLASS_1_COL = 0
        # Compute ROC curve and area under the curve
        FPR, TPR, thresholds = roc(yData[test], probas_[:, CLASS_1_COL])
        roc_auc = auc(FPR, TPR)
        # Trailing commas keep these three prints on one output line.
        print "\tFold:", foldNum, " AUC:", roc_auc, "Non0Coefs:", numNon0Coefs,
        print "Reg:", regularization,
        print localTimeString()
        sys.stdout.flush()
        sumAUC += roc_auc
        # Pool this fold's true labels and predicted probabilities so one
        # ROC/AUC can be computed over all held-out samples at the end.
        yTestDataAllFolds = numpy.concatenate((yTestDataAllFolds, yData[test]))
        probasTestDataAllFolds = \
                numpy.concatenate((probasTestDataAllFolds,probas_[:,CLASS_1_COL]) )
    # AUC over the concatenated held-out predictions from every fold.
    FPRallFolds, TPRallFolds, thresholds = roc(yTestDataAllFolds,
                                               probasTestDataAllFolds)
    roc_auc_allFolds = auc(FPRallFolds, TPRallFolds)
    print "AUC_all_folds:", roc_auc_allFolds,
    print "Reg:", regularization, "Penalty:", penalty, "kFolds:", kFolds,
    print localTimeString()
    return roc_auc_allFolds
# $Id$

import numpy as np

from scikits.learn.logistic import LogisticRegression
from scikits.learn import datasets

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Set regularization parameter
C = 0.1

classifier_l1_LR = LogisticRegression(C=C, penalty='l1')
classifier_l2_LR = LogisticRegression(C=C, penalty='l2')
classifier_l1_LR.fit(X, y)
classifier_l2_LR.fit(X, y)

hyperplane_coefficients_l1_LR = classifier_l1_LR.coef_[:]
hyperplane_coefficients_l2_LR = classifier_l2_LR.coef_[:]

# hyperplane_coefficients_l1_LR contains zeros due to the
# L1 sparsity inducing norm

pct_non_zeros_l1_LR = np.mean(hyperplane_coefficients_l1_LR != 0) * 100
pct_non_zeros_l2_LR = np.mean(hyperplane_coefficients_l2_LR != 0) * 100

print "Percentage of non zeros coefficients (L1) : %f" % pct_non_zeros_l1_LR
print "Percentage of non zeros coefficients (L2) : %f" % pct_non_zeros_l2_LR