Beispiel #1
0
# NOTE(review): this script uses Python 2-only syntax (the `print` statement,
# generator `.next()`); it will not run under Python 3 without changes.
import numpy as np
import importer as im
import evaluation as ev
import data_processing as dp
from sklearn.svm import SVC
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# current versions provide train_test_split in sklearn.model_selection.
import sklearn.cross_validation as cv

# load data
path = "../data/train.csv"
# Read at most 10000 labeled samples from the training CSV.
data = im.read_labeled(path, 10000)
# Vectorize selected raw columns into numeric features; the tuples appear to be
# (feature_name, source_column_index) -- TODO confirm against data_processing.
data = dp.vectorize(data, 1, features=[('latitude', 7), ('longitude', 8), ('day', 0), ('day_of_week', 0), ('time', 0)])
# The vectorizer is a generator whose first yielded item is the mapping from
# crime-category names to integer ids (Python 2 generator protocol).
crime_to_id_dict = data.next()
data = im.to_numpy_array(data)
# Scale the five feature columns to unit variance before training.
data = dp.ensure_unit_variance(data, columns_to_normalize=(0,1,2,3,4))

# separate data in features and labels
Y = data[:,5].astype(int)  # crime ids live in column 5 and are integral
X = data[:,:5]             # columns 0-4 are the feature vectors

# split data in training data and test data
train_X, test_X, train_Y, test_Y = cv.train_test_split(X, Y, test_size=0.33)

# run svm for several values for C
for c in [0.1, 1, 1.5, 10,20,50,100,200]:
	print "C = {0}".format(c)
	
	# create SVM with an RBF kernel; gamma is hard-coded to 1000
	clf = SVC(C=c,kernel='rbf', gamma=1000)
	
	# fit SVM
	# NOTE(review): the snippet is truncated here -- no scoring/evaluation of
	# the fitted model is visible in this view.
	clf.fit(train_X, train_Y)
Beispiel #2
0
	which for each sample, indicates the estimated probabilities of the various learned crime types.
	'''
	# Delegate to the fitted k-NN classifier; returns one probability row per sample.
	return knn_c.predict_proba(data)

if __name__ == '__main__':
	########## Training phase ##########
	# In this phase the original (labeled) training data is split into a new set of training and test data.
	# These sets are used to determine the optimal parameters (number of neighbors) for a NN classifier.
	# For each tested classifier, the score is calculated and printed. Additionally, the log loss as applied
	# by kaggle is calculated and printed for the best classifier.

	train_path = 'data/train.csv'
	predictions_path = 'data/predictions.csv'

	# Load training data
	data = importer.read_labeled(train_path, 3000) # Read at most 3000 data points

	# Turn raw columns into numeric feature vectors; the tuples appear to be
	# (feature_name, source_column_index) -- TODO confirm against dapo.vectorize.
	data = dapo.vectorize(data, 1, features=[('latitude', 7), ('longitude', 8), ('day', 0), ('day_of_week', 0), ('time', 0), ('streets', 6)])
	# The vectorizer is a generator; its first yielded item is the mapping from
	# crime-category names to integer ids (.next() is Python 2-only syntax).
	crime_to_id_dict = data.next()
	data = importer.to_numpy_array(data) # Collect data in array
	data = dapo.ensure_unit_variance(data, columns_to_normalize=(0, 1, 2, 3, 4)) # Ensure unit variance in appropriate columns

	# Separate labels from data
	crime_ids = data[:,-1].astype(int) # Crime ids are in the last column, and are integer values
	locations = data[:,:-1] # The rest is data

	# Calculate ranges for the modulo used on circular quantities
	# (day, day-of-week and time wrap around, so each range below is the span
	# |min - max| of the normalized column, used as the wrap-around modulus).
	modulo_for_day = abs( min(locations[:,2]) - max(locations[:,2]) )
	modulo_for_day_of_week = abs( min(locations[:,3]) - max(locations[:,3]) )
	modulo_for_time = abs( min(locations[:,4]) - max(locations[:,4]) )
	modulae = (modulo_for_day, modulo_for_day_of_week, modulo_for_time)
Beispiel #3
0
	# Figure-wide heading; the per-plot title (set below) carries the specifics.
	fig.suptitle("Crimes in San Francisco", fontsize=14, fontweight = "bold")
	
	ax = fig.add_subplot(111)
	ax.set_title(title)
	
	ax.set_xlabel('Horizontal GPS coordinates')
	ax.set_ylabel('Vertical GPS coordinates')
	
	# Scatter the crime locations; column 0 is plotted on x, column 1 on y
	# (presumably longitude/latitude, matching the axis limits below -- confirm).
	plt.scatter(crimes[:,0], crimes[:,1])
	plt.grid(True)
	
	# Pin the view to the San Francisco bounding box so every plot aligns
	# with the background map image.
	ax.set_xlim([-122.535908,-122.347306])
	ax.set_ylim([37.696850,37.839763])
	
	# Draw the city map behind the scatter points (zorder=0), stretched over
	# the exact same bounding box as the axis limits above.
	img = imread("../img/sf.png")
	plt.imshow(img,zorder=0,extent=[-122.535908, -122.347306, 37.696850, 37.839763])
	
	plt.show()

if __name__ == '__main__':
	# Load the labeled training data, reduce it to (latitude, longitude)
	# vectors, and draw one San Francisco map per crime category.
	path = "../data/train.csv"
	data = im.read_labeled(path)
	data = dp.vectorize(data, 1, features=[('latitude', 7), ('longitude', 8)])
	# First generator item is the crime-name -> id mapping (Python 2 .next()).
	crime_to_id_dict = data.next()
	data = im.to_numpy_array(data)
	# Invert the mapping so integer ids translate back to crime names.
	id_to_crime_dict = dict((crime_id, crime_name) for crime_name, crime_id in crime_to_id_dict.items())
	crime_lists = create_crime_lists(data, id_to_crime_dict)

	# One scatter plot per crime category.
	for crime_name, coordinates in crime_lists.items():
		plot_crimes(np.asarray(coordinates), crime_name)