from collections import Counter

from feature_extractor import FeatureExtractor
# Assumed project-local wrapper class; its .scikit(penalty, C) is expected to
# return a configured scikit-learn logistic regression model.
from logistic_regression import LogisticRegression


def runOnSplit(penalties, constants, split):
	"Running on a " + str(split*100) + '/' + str((1-split)*100) + ' split' 
	fe = FeatureExtractor(split)
	featurized = fe.featurizeFiles('../data')
	classNames = featurized[0]
	trainMatrix, trainLabels = featurized[1:3]
	devMatrix, devLabels = featurized[3:5]
	trainFiles, devFiles = featurized[5:]


	classCounts = Counter(devLabels)

	for penalty in penalties:
		for C in constants:
			print "\nPenalty, regularization: ", str(penalty), str(C)

			abstractModel = LogisticRegression()
			model = abstractModel.scikit(penalty, C)
			model_params = (penalty, C)
			model.fit(trainMatrix, trainLabels)

			errors, rankedExamples = Counter(), []

			score = model.score(devMatrix, devLabels)
			predicted_labels = model.predict(devMatrix)

			probs = model.predict_proba(devMatrix)

			for j, pred in enumerate(predicted_labels):
				if pred != devLabels[j]:
					errors[devLabels[j]] += 1

			for i, p in enumerate(probs):
				# keep (class probabilities, file name, whether the prediction was correct)
				rankedExamples.append((p, devFiles[i], predicted_labels[i] == devLabels[i]))

			results = ''
			for i, c in enumerate(classNames):
				# per-class miss rate on the dev set (assumes every class appears there)
				missRate = str(float(errors[i]) / classCounts[i])
				results += '\t' + c + ' error: ' + missRate + '\n'

			results += '\tScore: ' + str(score)
			fileName = 'results/scores/LRsplit'
			for param in model_params:
				fileName += '_' + str(param)
			fileName += '.txt'
			with open(fileName, 'w') as f:
				f.write(results)
			print results

			print '..ranking examples'
			if len(rankedExamples):
				# sort by the predicted probability of the first class
				examples = sorted(rankedExamples, key=lambda e: e[0][0])
				fileName = 'results/rankedExamples/LRsplit_' + str(split*100)
				for param in model_params:
					fileName += '_' + str(param)
				fileName += '.txt'
				with open(fileName,'w') as f:
					for e in examples:
						results = e[1]
						results += '\n\t Probability of class '
						results += classNames[0] + ': '
						results += str(e[0][0])
						results += '\n\t Correct: ' + str(e[2]) + '\n'
						f.write(results)
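
A minimal driver for runOnSplit might look like the sketch below; the penalty names, the C grid, and the 0.8 split are illustrative assumptions rather than values fixed by the code above.

if __name__ == '__main__':
	# hypothetical grid: both sklearn penalty names and a small range of C values
	penalties = ['l1', 'l2']
	constants = [0.01, 0.1, 1.0, 10.0]
	runOnSplit(penalties, constants, 0.8)  # train on 80% of the data, evaluate on the rest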
Example #2
# import vectorizeFiles as VF
from sklearn.neighbors import KNeighborsClassifier  # , DistanceMetric
# import numpy as np
# import getFileNames as gf
# import sys
# import scipy
from sklearn import grid_search  # in modern scikit-learn: from sklearn.model_selection import GridSearchCV
from feature_extractor import FeatureExtractor


fe = FeatureExtractor(1)
featurized = fe.featurizeFiles('../data')
classNames, repubAndDemMatrix, labels = featurized[:3]
# [repubAndDemMatrix,vectorizerRepubDem,labels]=VF.extractWordCounts(True,True,False)
parameters = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10]}
#,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]}
#'weights':('uniform','distance'), 'p':[1, 2, 3, 4, 5]
#'metric':('euclidean', 'manhattan','chebyshev','minkowski','jaccard','matching','dice','kulsinski','rogerstanimoto','russellrao','sokalmichener','sokalsneath'),
kn = KNeighborsClassifier()
clf = grid_search.GridSearchCV(kn, parameters)
clf.fit(repubAndDemMatrix, labels)
print clf.best_estimator_  # <- lots of detail
print clf.best_params_  # <- more useful
print clf.best_score_  # <- mean cross-validated score (accuracy), not an error rate
print clf.score(repubAndDemMatrix, labels)  # <- accuracy on the training data


# optimal parameter of 4 neighbors, best CV score is 0.668573607933, training score is 0.828488372093

# if we shuffle the training data so that it is not all Democrats followed by all Republicans,
# we get an optimal param of 1 neighbor, a 0.689 CV score, and a 1.0 training score
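
The shuffle described in the last comment could be done along these lines; sklearn.utils.shuffle permutes the matrix and the labels consistently, and random_state=0 is an arbitrary choice for reproducibility.

from sklearn.utils import shuffle

# sketch of the shuffle mentioned above, assuming repubAndDemMatrix and labels are index-aligned
shuffledMatrix, shuffledLabels = shuffle(repubAndDemMatrix, labels, random_state=0)
clf.fit(shuffledMatrix, shuffledLabels)
print clf.best_params_  # per the note above: 1 neighbor once the party ordering is broken up
print clf.best_score_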