Example #1
def report_status_selection(selection):
	[dataset, features] = parse_theme(selection)
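	# NOTE: targets is a module-level variable in the source file, not a parameter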
	[known_dataset, known_targets, unk] = split_dataset(dataset, targets)

	feats = feature_context(known_dataset, known_targets, features)
	print selection
	print feats
	print 'Nr selected features %d' % len(feats)
	print 'Nr total features %d' % len(features)
	print 'Features eliminated %s' % set(features).difference(feats)
	return feats

def thematic_data_from_feature_selection(orig_targets, theme, target):
	[dataset, features] = parse_theme(theme)
	[known_dataset, known_targets, unk] = split_dataset(dataset, orig_targets)
	
	nr_times = int(math.floor(TOP_FEATURES_PERCENTAGE_THRESHOLD * len(features)))

	known_targets = np.asarray(known_targets)
	ssa_features = select_proxy_features(theme, target, nr_times)
	sf = SelectedFeatures(known_dataset, known_targets, ssa_features, features)

	print '####### %s FEATURES ####### %d %s' % (theme, len(ssa_features), str(ssa_features)) 

	return sf.extract_data_from_selected_features(), known_targets

def cv(theme, percentage, current_svm):
	[dataset, features] = parse_theme(theme)
	[known_dataset, known_targets, unk] = split_dataset(dataset, targets)
	known_targets = np.asarray(known_targets)

	# cv_features = features_cross_validation(known_dataset, known_targets, features, current_svm)
	# selected_features = select_final_features_from_cv(cv_features, percentage)
	selected_features = select_features(percentage, theme)

	sf = SelectedFeatures(known_dataset, known_targets, selected_features, features)
	combined_dataset = sf.extract_data_from_selected_features()

	std = StandardizedData(known_targets, combined_dataset)
	known_dataset_scaled, known_targets = std.split_and_standardize_dataset()  

	print '####### FEATURES ####### %d \n %s' % (len(selected_features), str(selected_features)) 	
	return cross_validation(np.array(known_dataset_scaled), known_targets, ids, current_svm)
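A hedged usage sketch for cv: the theme name, the 10% threshold, and the SVM settings below are invented, and targets and ids are assumed to be loaded at module scope, as in the __main__ blocks of the later examples.

from sklearn.svm import SVC

# keep the top 10% of features for the 'all' theme and cross-validate an RBF SVM
results = cv('all', 0.1, SVC(kernel='rbf', C=1.0, gamma=0.1, class_weight='auto'))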
Example #4
def thematic_data_from_feature_selection(orig_targets, theme, percentage):
	[dataset, features] = parse_theme(theme)
	[known_dataset, known_targets, unk] = split_dataset(dataset, orig_targets)
	
	known_targets = np.asarray(known_targets)

	# these come from feature_selection_cv; they are commented out because the
	# selected features were already saved, to cut computation time
	# cv_features = features_cross_validation(known_dataset, known_targets, features)
	# selected_features = select_final_features_from_cv(cv_features, percentage)
	selected_features = select_features(percentage, theme)

	sf = SelectedFeatures(known_dataset, known_targets, selected_features, features)

	print '####### %s FEATURES ####### %d %s' % (theme, len(selected_features), str(selected_features)) 

	return sf.extract_data_from_selected_features(), known_targets
Example #5
import sys
sys.path.insert(0, 'utils/')
from load_data import *
from project_data import *
from parse_theme import *
from split_dataset import *

import numpy as np

if __name__ == "__main__":
	spreadsheet = Spreadsheet(project_data_file)
	data = Data(spreadsheet)
	targets = data.targets

	[dataset, features] = parse_theme('all')
	[known_dataset, known_targets, unk] = split_dataset(dataset, targets)

	# target semantics (see split_dataset in Example #6): 0 = unknown and is
	# filtered out by split_dataset, so the NEG count below is always zero
	print 'NEG %d' % len([x for x in known_targets if x==0])
	print 'POS %d' % len([x for x in known_targets if x==1])

	print 'HIGHVAL %d' % len([x for x in known_targets if x==1])
	print 'CIVIL %d' % len([x for x in known_targets if x==2])
Example #6
def split_dataset(dataset, targets):
	# a target of 0 marks an unlabeled row; everything else is known data
	unknowns = []
	known_dataset = []
	known_targets = []
	for i in range(len(targets)):
		if targets[i] == 0:
			unknowns.append(dataset[i])
		else:
			known_dataset.append(dataset[i])
			known_targets.append(targets[i])

	return [np.array(known_dataset), known_targets, np.array(unknowns)]
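A minimal sketch of what split_dataset returns, with toy rows and labels invented for illustration:

import numpy as np

dataset = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
targets = [0, 1, 2]  # 0 marks an unlabeled row

known_dataset, known_targets, unknowns = split_dataset(dataset, targets)
# known_dataset -> array([[3., 4.], [5., 6.]])
# known_targets -> [1, 2]
# unknowns      -> array([[1., 2.]])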

def decision_tree(dataset, targets):
	[known_dataset, known_targets, unknowns] = split_dataset(dataset, targets)

	model = DecisionTreeClassifier(criterion='entropy')
	model.fit(known_dataset, known_targets)
	print 'Model score: %f' % model.score(known_dataset, known_targets)
	print model.feature_importances_
	with open("tree.dot", 'w') as f:
		export_graphviz(model, out_file=f)
		# render with: dot -Tpdf tree.dot -o tree.pdf

if __name__ == "__main__":
	spreadsheet = Spreadsheet(project_data_file)
	data = Data(spreadsheet)
	targets = data.targets

	[dataset, features] = parse_theme(sys.argv[1])
	decision_tree(dataset, targets)
Example #7
	# tail of an evaluation helper: (hp, hr, hf) and (cp, cr, cf) are
	# precision/recall/F1 for the high-value and civilian classes
	print 'Civil recall %f' % cr
	print 'Civil f1 %f' % cf

	return error_rate, f1, (hp, hr, hf), (cp, cr, cf)

if __name__ == "__main__":

	training_spreadsheet = Spreadsheet(project_data_file)
	training_data = Data(training_spreadsheet)
	training_targets = training_data.targets

	testing_spreadsheet = Spreadsheet(addendum_data_file, upsampling=False)
	testing_data = Data(testing_spreadsheet, upsampling=False)
	testing_targets = testing_data.targets

	[training_data, features] = parse_theme('all')
	[testing_data, feats] = parse_theme_from_file('all', addendum_data_file)
	assert features == feats

	[training_data, training_targets, unk] = split_dataset(training_data, training_targets)
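	# single_features_90 is a precomputed feature list, saved from an earlier
	# selection run (see the comments in Example #4)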
	selected_features = single_features_90
	sf = SelectedFeatures(training_data, training_targets, selected_features, features)
	training_data = sf.extract_data_from_selected_features()

	sf = SelectedFeatures(testing_data, testing_targets, selected_features, features)
	testing_data = sf.extract_data_from_selected_features()
		
	# standardize dataset to zero mean and unit variance
	scaler = StandardScaler()

	testing_data = replace_missings(testing_data)
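The example cuts off here; a typical continuation (an assumption, not the original code) would fit the scaler on the training rows and apply the same transform to the test rows:

	training_data = scaler.fit_transform(training_data)
	testing_data = scaler.transform(testing_data)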
Example #8
	def get_known_data_from_theme(self, theme):
		[theme_dataset, theme_features] = parse_theme(theme)
		[known_dataset, known_targets, unk] = split_dataset(theme_dataset, self.targets)
		known_targets = np.asarray(known_targets)
		return [known_dataset, known_targets]
Example #9
class MidpointNormalize(Normalize):
    # The class name and __init__ signature are assumed: the snippet began
    # mid-__init__, and this is the standard matplotlib recipe for pinning
    # `midpoint` to the middle of the colormap.
    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))

if __name__ == "__main__":
	spreadsheet = Spreadsheet(project_data_file)
	data = Data(spreadsheet)
	targets = data.targets
	ids = data.ids

	theme = raw_input("Theme.\n")
	percentage = float(raw_input("Percentage as float.\n"))

	[dataset, features] = parse_theme(theme)
	[known_dataset, known_targets, unk] = split_dataset(dataset, targets)
	known_targets = np.asarray(known_targets)
	
	selected_features = select_features(percentage, theme)
	sf = SelectedFeatures(known_dataset, known_targets, selected_features, features)
	dataset = sf.extract_data_from_selected_features()

	dataset = preprocessing.scale(dataset)

	C_range = np.arange(0.1, 9, 0.1)
	gamma_range = np.arange(0.1, 9, 0.1)
	param_grid = dict(gamma=gamma_range, C=C_range)
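	# exhaustive grid search over (C, gamma) for an RBF SVC, scored by F1
	# under 10-fold stratified cross-validation (pre-0.18 scikit-learn API)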
	# cv = StratifiedShuffleSplit(known_targets, random_state=42)
	cv = StratifiedKFold(known_targets, n_folds=10)
	grid = GridSearchCV(SVC(class_weight='auto'), param_grid=param_grid, cv=cv, scoring='f1')
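Example #9 stops before the search is run. A hedged sketch of the usual continuation (assumed, not part of the original) fits the grid and plots the score surface with the MidpointNormalize class above; the plotting constants are illustrative:

	import matplotlib.pyplot as plt

	# run the search and report the best parameter pair
	grid.fit(dataset, known_targets)
	print 'Best params %s, best F1 %f' % (grid.best_params_, grid.best_score_)

	# mean validation scores reshaped onto the (C, gamma) grid
	# (grid_scores_ is the pre-0.18 attribute; parameters iterate alphabetically, C slowest)
	scores = np.array([s[1] for s in grid.grid_scores_]).reshape(len(C_range), len(gamma_range))

	# MidpointNormalize pins the colormap midpoint near the top scores so that
	# small differences between good parameter pairs stay visible
	plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
	           norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
	plt.xlabel('gamma')
	plt.ylabel('C')
	plt.colorbar()
	plt.show()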