from collections import Counter

from feature_extractor import FeatureExtractor
# Assumed project-local wrapper class; its .scikit(penalty, C) is expected to
# return a configured scikit-learn logistic regression model.
from logistic_regression import LogisticRegression


def runOnSplit(penalties, constants, split):
	"Running on a " + str(split*100) + '/' + str((1-split)*100) + ' split' 
	fe = FeatureExtractor(split)
	featurized = fe.featurizeFiles('../data')
	classNames = featurized[0]
	trainMatrix, trainLabels = featurized[1:3]
	devMatrix, devLabels = featurized[3:5]
	trainFiles, devFiles = featurized[5:]


	classCounts = Counter(devLabels)

	for penalty in penalties:
		for C in constants:
			print "\nPenalty, regularization: ", str(penalty), str(C)

			abstractModel = LogisticRegression()
			model = abstractModel.scikit(penalty, C)
			model_params = (penalty, C)
			model.fit(trainMatrix, trainLabels)

			errors, rankedExamples = Counter(), []

			score = model.score(devMatrix, devLabels)
			predicted_labels = model.predict(devMatrix)

			probs = model.predict_proba(devMatrix)

			for j, pred in enumerate(predicted_labels):
				if pred != devLabels[j]:
					errors[devLabels[j]] += 1

			for i, p in enumerate(probs):
				# keep (class probabilities, file name, whether the prediction was correct)
				rankedExamples.append((p, devFiles[i], predicted_labels[i] == devLabels[i]))

			results = ''
			for i, c in enumerate(classNames):
				# per-class miss rate on the dev set (assumes every class appears there)
				missRate = str(float(errors[i]) / classCounts[i])
				results += '\t' + c + ' error: ' + missRate + '\n'

			results += '\tScore: ' + str(score)
			fileName = 'results/scores/LRsplit'
			for param in model_params:
				fileName += '_' + str(param)
			fileName += '.txt'
			with open(fileName, 'w') as f:
				f.write(results)
			print results

			print '..ranking examples'
			if len(rankedExamples):
				# sort by the predicted probability of the first class
				examples = sorted(rankedExamples, key=lambda e: e[0][0])
				fileName = 'results/rankedExamples/LRsplit_' + str(split*100)
				for param in model_params:
					fileName += '_' + str(param)
				fileName += '.txt'
				with open(fileName,'w') as f:
					for e in examples:
						results = e[1]
						results += '\n\t Probability of class '
						results += classNames[0] + ': '
						results += str(e[0][0])
						results += '\n\t Correct: ' + str(e[2]) + '\n'
						f.write(results)
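
A minimal driver for runOnSplit might look like the sketch below; the penalty names, the C grid, and the 0.8 split are illustrative assumptions rather than values fixed by the code above.

if __name__ == '__main__':
	# hypothetical grid: both sklearn penalty names and a small range of C values
	penalties = ['l1', 'l2']
	constants = [0.01, 0.1, 1.0, 10.0]
	runOnSplit(penalties, constants, 0.8)  # train on 80% of the data, evaluate on the rest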
Example #2
# import vectorizeFiles as VF
from sklearn.neighbors import KNeighborsClassifier  # , DistanceMetric
# import numpy as np
# import getFileNames as gf
# import sys
# import scipy
from sklearn import grid_search  # in modern scikit-learn: from sklearn.model_selection import GridSearchCV
from feature_extractor import FeatureExtractor


fe = FeatureExtractor(1)
featurized = fe.featurizeFiles('../data')
classNames, repubAndDemMatrix, labels = featurized[:3]
# [repubAndDemMatrix,vectorizerRepubDem,labels]=VF.extractWordCounts(True,True,False)
parameters = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10]}
#,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]}
#'weights':('uniform','distance'), 'p':[1, 2, 3, 4, 5]
#'metric':('euclidean', 'manhattan','chebyshev','minkowski','jaccard','matching','dice','kulsinski','rogerstanimoto','russellrao','sokalmichener','sokalsneath'),
kn = KNeighborsClassifier()
clf = grid_search.GridSearchCV(kn, parameters)
clf.fit(repubAndDemMatrix, labels)
print clf.best_estimator_  # <- lots of detail
print clf.best_params_  # <- more useful
print clf.best_score_  # <- mean cross-validated score (accuracy), not an error rate
print clf.score(repubAndDemMatrix, labels)  # <- accuracy on the training data


# optimal parameter of 4 neighbors, best CV score is 0.668573607933, training score is 0.828488372093

# if we shuffle the training data so that it is not all Democrats followed by all Republicans,
# we get an optimal param of 1 neighbor, a 0.689 CV score, and a 1.0 training score
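
The shuffle described in the last comment could be done along these lines; sklearn.utils.shuffle permutes the matrix and the labels consistently, and random_state=0 is an arbitrary choice for reproducibility.

from sklearn.utils import shuffle

# sketch of the shuffle mentioned above, assuming repubAndDemMatrix and labels are index-aligned
shuffledMatrix, shuffledLabels = shuffle(repubAndDemMatrix, labels, random_state=0)
clf.fit(shuffledMatrix, shuffledLabels)
print clf.best_params_  # per the note above: 1 neighbor once the party ordering is broken up
print clf.best_score_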