Example #1
0
	def test_ensure_unit_variance(self):
		"""Check that ensure_unit_variance rescales only the requested columns.

		Columns 0, 2 and 3 are expected to come back with their mean
		subtracted and divided by their (population) standard deviation;
		the remaining columns are expected to keep their original mean
		and standard deviation.
		"""
		columns_to_normalize = [0, 2, 3]
		data = np.linspace(1, 50).reshape((10, 5))

		# Build the expectation on an independent copy *before* calling the
		# function under test. If ensure_unit_variance happens to work in
		# place and return its argument, comparing against `data` itself
		# would compare the array with itself and could never fail.
		expected = data.copy()
		for column in columns_to_normalize:
			values = data[:, column]
			expected[:, column] = (values - np.mean(values)) / np.std(values)

		uv_data = dp.ensure_unit_variance(data, columns_to_normalize)

		# Compare column by column (transpose so that iteration yields columns).
		for target, actual in zip(expected.T, uv_data.T):
			self.assertAlmostEqual(np.mean(actual), np.mean(target))
			self.assertAlmostEqual(np.std(actual), np.std(target))
Example #2
0
	########## Training phase ##########
	# In this phase the original (labeled) training data is split into a new set of training and test data.
	# These sets are used to determine the optimal parameters (number of neighbors) for a NN classifier.
	# For each tested classifier, the score is calculated and printed. Additionally, the log loss as applied
	# by kaggle is calculated and printed for the best classifier.

	# Input file with labeled samples, and output file for predictions
	# (predictions_path is not used in the part of the code shown here).
	train_path = 'data/train.csv'
	predictions_path = 'data/predictions.csv'

	# Load training data
	data = importer.read_labeled(train_path, 3000) # Read at most 3000 data points

	# Turn each record into a numeric row. The feature order given here appears to determine the
	# column order relied on below (0: latitude, 1: longitude, 2: day, 3: day_of_week, 4: time,
	# then streets) -- TODO confirm against dapo.vectorize.
	data = dapo.vectorize(data, 1, features=[('latitude', 7), ('longitude', 8), ('day', 0), ('day_of_week', 0), ('time', 0), ('streets', 6)])
	# The first item produced by vectorize is taken off separately; judging by the variable name
	# it is the crime-name -> id mapping (verify against dapo.vectorize).
	# NOTE(review): .next() is the Python 2 iterator method; next(data) would work on Python 2.6+ and 3.
	crime_to_id_dict = data.next()
	data = importer.to_numpy_array(data) # Collect data in array
	data = dapo.ensure_unit_variance(data, columns_to_normalize=(0, 1, 2, 3, 4)) # Ensure unit variance in appropriate columns

	# Separate labels from data
	crime_ids = data[:,-1].astype(int) # Crime ids are in the last column, and are integer values
	locations = data[:,:-1] # The rest is data (all feature columns, not only the coordinates)

	# Calculate ranges for the modulo used on circular quantities
	# (columns 2, 3 and 4 of the feature matrix). The ranges are taken after the
	# ensure_unit_variance call above, so they are measured on the rescaled values.
	modulo_for_day = abs( min(locations[:,2]) - max(locations[:,2]) )
	modulo_for_day_of_week = abs( min(locations[:,3]) - max(locations[:,3]) )
	modulo_for_time = abs( min(locations[:,4]) - max(locations[:,4]) )
	modulae = (modulo_for_day, modulo_for_day_of_week, modulo_for_time)

	# Split into train and test set (33% of the samples are held out for testing)
	loc_train, loc_test, crime_ids_train, crime_ids_test = cv.train_test_split(locations, crime_ids, test_size=0.33)

	# Train and evaluate
Example #3
0
import numpy as np
import importer as im
import evaluation as ev
import data_processing as dp
from sklearn.svm import SVC
import sklearn.cross_validation as cv

# Load the labeled training data and turn it into a numeric array.
# The second argument of read_labeled presumably caps the number of rows read -- TODO confirm.
path = "../data/train.csv"
data = im.read_labeled(path, 10000)
data = dp.vectorize(data, 1, features=[('latitude', 7), ('longitude', 8), ('day', 0), ('day_of_week', 0), ('time', 0)])
# The first item produced by vectorize is taken off before the rows are collected;
# judging by the name it is the crime-name -> id mapping (verify against dp.vectorize).
# The built-in next() is used instead of the Python-2-only .next() method so this
# line works on both Python 2.6+ and Python 3.
crime_to_id_dict = next(data)
data = im.to_numpy_array(data)
data = dp.ensure_unit_variance(data, columns_to_normalize=(0,1,2,3,4))

# Separate data into features and labels: the five feature columns come first,
# the label is read from column 5 and cast to int.
Y = data[:,5].astype(int)
X = data[:,:5]

# Split data into training data and test data (33% held out for testing).
# NOTE(review): `cv` is sklearn.cross_validation, which was removed in
# scikit-learn 0.20; train_test_split now lives in sklearn.model_selection.
train_X, test_X, train_Y, test_Y = cv.train_test_split(X, Y, test_size=0.33)

# Run the SVM for several values of the regularization parameter C.
for c in [0.1, 1, 1.5, 10,20,50,100,200]:
	# Single-argument print(...) produces the same output on Python 2 and 3.
	print("C = {0}".format(c))
	
	# Create SVM with an RBF kernel.
	# NOTE(review): gamma=1000 on unit-variance features makes the kernel
	# extremely narrow -- confirm this value is intended.
	clf = SVC(C=c,kernel='rbf', gamma=1000)
	
	# Fit SVM on the training split.
	clf.fit(train_X, train_Y)