def classifier_train(train_features,
                     train_labels,
                     test_features,
                     svm_eps=1e-5,
                     svm_C=10**4,
                     classifier_type="liblinear"):
    """Classifier training using SVMs or logistic regression.

    Input:
        train_features  = training features (both positive and negative)
        train_labels    = corresponding label vector
        test_features   = test features (sphered together with training set)
        svm_eps         = eps of svm
        svm_C           = C parameter of svm
        classifier_type = 'liblinear', 'libSVM', 'LRL1', or 'LRL2'

    Returns the fitted classifier.
    Raises ValueError for an unrecognized classifier_type (previously this
    fell through and crashed with UnboundLocalError on clf.fit).
    """
    # Sphering: normalize train/test features together before fitting.
    train_features, test_features = __sphere(train_features, test_features)

    # Single if/elif chain: the original used two separate `if` statements
    # before the elif chain, which obscured the mutually-exclusive dispatch.
    if classifier_type == 'liblinear':
        clf = svm.LinearSVC(eps=svm_eps, C=svm_C)
    elif classifier_type == 'libSVM':
        clf = svm.SVC(eps=svm_eps, C=svm_C, probability=True)
    elif classifier_type == 'LRL1':
        clf = LogisticRegression(C=svm_C, penalty='l1')
    elif classifier_type == 'LRL2':
        # BUG FIX: this branch previously passed penalty='l1', making
        # 'LRL2' behave identically to 'LRL1'. Use the L2 penalty here.
        clf = LogisticRegression(C=svm_C, penalty='l2')
    else:
        raise ValueError("Unknown classifier_type: %r" % (classifier_type,))

    clf.fit(train_features, train_labels)
    return clf
def makeLogisticRegression(inData, regularization, penalty): print "Logistic regression parameters" print "Regularization = ", regularization print "Penalty (L1/L2) = ", penalty print "Starting logistic regression calculation" sys.stdout.flush() xData, yData = getXYData(inData) logRegClassifier = LogisticRegression(C=regularization, penalty=penalty) logRegClassifier.fit(xData, yData) print "Finished logistic regression calculation" return logRegClassifier
def test_dense_tf_idf():
    """Smoke-test TF-IDF extraction + classification on the food corpora."""
    # Hash-vectorize both document collections into a 1000-dim space.
    vectorizer = HashingVectorizer(dim=1000, probes=3)
    vectorizer.vectorize(JUNK_FOOD_DOCS)
    vectorizer.vectorize(NOTJUNK_FOOD_DOCS)

    # Extract the TF-IDF matrix and check its expected shape.
    X = vectorizer.get_tfidf()
    assert_equal(X.shape, (11, 1000))

    # Label junk food as -1, the others as +1.
    y = np.ones(X.shape[0])
    y[:6] = -1

    # Train on the interior samples, then predict the held-out endpoints.
    model = LogisticRegression().fit(X[1:-1], y[1:-1])
    assert_equal(model.predict([X[0]]), [-1])
    assert_equal(model.predict([X[-1]]), [1])
def AUCkFoldLogisticRegression(regularization, inData, penalty, kFolds):
    """Evaluate a logistic-regression model by k-fold cross-validated AUC.

    For each fold, fits LogisticRegression(C=regularization, penalty=penalty)
    on the training split and computes the fold's ROC AUC on the test split.
    Test labels and predicted probabilities from all folds are pooled, and the
    AUC of the pooled predictions is returned.

    Returns:
        roc_auc_allFolds -- AUC computed over the pooled out-of-fold predictions.
    Raises:
        UserWarning -- if kFolds does not evenly divide the number of samples.
    """
    print "\n\tCalculating AUC for regularization", regularization, "using", kFolds, "folds"
    sys.stdout.flush()
    xData, yData = getXYData(inData)
    # nFeatures is unpacked but not used below; kept for shape documentation.
    nSamples, nFeatures = xData.shape
    if nSamples % kFolds != 0:
        # NOTE(review): the message is missing its closing ")" and hard-codes
        # 5922 — specific to this dataset's sample count; confirm with callers.
        raise UserWarning( "Uneven fold sizes! Must evenly divide 5922 (e.g. 2,3,7 or 9 folds" )
    # 2, 3, 7, and 9 are factors of 5922 (#data points) & yield equal fold sizes
    crossValFolds = KFold(nSamples, kFolds)
    # Accumulators for pooled out-of-fold labels and probabilities.
    yTestDataAllFolds = array([])
    probasTestDataAllFolds = array([])
    # sumAUC accumulates per-fold AUCs but is not returned or printed.
    sumAUC = 0.0
    for foldNum, (train, test) in enumerate(crossValFolds):
        # fit a new LR model for each fold's data & evaluate using AUC
        LRclassifier = LogisticRegression(C=regularization, penalty=penalty)
        probas_ = LRclassifier.fit(xData[train], yData[train]).predict_proba(xData[test])
        # Count nonzero coefficients (interesting for L1, which induces sparsity).
        numNon0Coefs = sum( [1 for coef in LRclassifier.coef_[:][0] if coef != 0])
        # probas_ contains 2 columns of probabilities, one for each of the 2 classes (0,1)
        # In the documentation, seems like col 1 is for class 1,
        # but tests show it seems like col 0 is for class 1, so we use that below.
        CLASS_1_COL = 0
        # Compute ROC curve and area under the curve for this fold.
        FPR, TPR, thresholds = roc(yData[test], probas_[:, CLASS_1_COL])
        roc_auc = auc(FPR, TPR)
        # Trailing commas keep the fold report on one output line (Python 2).
        print "\tFold:", foldNum, " AUC:", roc_auc, "Non0Coefs:", numNon0Coefs,
        print "Reg:", regularization,
        print localTimeString()
        sys.stdout.flush()
        sumAUC += roc_auc
        # Pool this fold's test labels and class-1 probabilities.
        yTestDataAllFolds = numpy.concatenate((yTestDataAllFolds, yData[test]))
        probasTestDataAllFolds = \
            numpy.concatenate((probasTestDataAllFolds,probas_[:,CLASS_1_COL]) )
    # ROC/AUC over the pooled predictions of every fold.
    FPRallFolds, TPRallFolds, thresholds = roc(yTestDataAllFolds, probasTestDataAllFolds)
    roc_auc_allFolds = auc(FPRallFolds, TPRallFolds)
    print "AUC_all_folds:", roc_auc_allFolds,
    print "Reg:", regularization, "Penalty:", penalty, "kFolds:", kFolds,
    print localTimeString()
    return roc_auc_allFolds
# $Id$ import numpy as np from scikits.learn.logistic import LogisticRegression from scikits.learn import datasets iris = datasets.load_iris() X = iris.data y = iris.target # Set regularization parameter C = 0.1 classifier_l1_LR = LogisticRegression(C=C, penalty='l1') classifier_l2_LR = LogisticRegression(C=C, penalty='l2') classifier_l1_LR.fit(X, y) classifier_l2_LR.fit(X, y) hyperplane_coefficients_l1_LR = classifier_l1_LR.coef_[:] hyperplane_coefficients_l2_LR = classifier_l2_LR.coef_[:] # hyperplane_coefficients_l1_LR contains zeros due to the # L1 sparsity inducing norm pct_non_zeros_l1_LR = np.mean(hyperplane_coefficients_l1_LR != 0) * 100 pct_non_zeros_l2_LR = np.mean(hyperplane_coefficients_l2_LR != 0) * 100 print "Percentage of non zeros coefficients (L1) : %f" % pct_non_zeros_l1_LR print "Percentage of non zeros coefficients (L2) : %f" % pct_non_zeros_l2_LR