Example #1
# Imports needed by this example. TreeClassifier (and the alternative
# HCClassifier / KNNClassifier) and csvSimplifier are project-local modules
# from the repository linked below; their import lines are omitted here.
import io
import urllib.request

import numpy
from sklearn import datasets


def main(argv):
	# Load a dataset containing many instances each with a set of attributes and a target value.
	if len(argv) >= 2:
		fileOrURL = argv[1]
		isFile = not fileOrURL.startswith('http')
		if not isFile:
			# Read the data from the URL, which serves the file in CSV format, e.g.
			# fileOrURL = "http://archive.ics.uci.edu/ml/machine-learning-databases/iris/bezdekIris.data"
			# The trailing newline is stripped so genfromtxt does not see an empty last row.
			fileOrURL = io.StringIO(urllib.request.urlopen(fileOrURL).read().decode('utf-8')[:-1])

		# genfromtxt accepts either a filename or a file-like object.
		csv = numpy.genfromtxt(fileOrURL, delimiter=",", dtype=str)
		numcols = len(csv[0])
		# print ("CSV: ", csv)
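		# All columns except the last hold attribute values; the last column is the class label.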
		data = csv[:, :-1]
		targets = csv[:,-1]
	else:
		# Otherwise fall back to the Iris dataset that ships with scikit-learn.
		iris = datasets.load_iris()
		data = iris.data
		targets = iris.target
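		# The bundled Iris dataset has 150 instances, 4 numeric attributes each, and
		# 3 target classes. Note that iris.data holds floats, whereas the CSV branch
		# above yields strings.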

	# Randomize the order of the instances in the dataset, keeping each target
	# matched up with the appropriate instance: applying the same permutation to
	# both arrays keeps them aligned.
	perm = numpy.random.permutation(len(data))

	data = data[perm]
	targets = targets[perm]
	# Split the shuffled data into two sets: a training set (70%) and a testing set (30%).
	# Index of where to split (this needs to be an integer).
	index = int(round(len(data) * 0.3))
	test = numpy.arange(index)
	train = numpy.arange(index, len(data))
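	# For reference, scikit-learn's train_test_split performs the same shuffled
	# 70/30 split in a single call (equivalent sketch, not used here):
	#   from sklearn.model_selection import train_test_split
	#   trainData, testData, trainTargets, testTargets = train_test_split(
	#       data, targets, test_size=0.3)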

	# Instantiate your new classifier
	# classifier = HCClassifier()
	# classifier = KNNClassifier(3)
	classifier = TreeClassifier()
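	# HCClassifier, KNNClassifier and TreeClassifier are project-local classes; each
	# exposes the train(data, targets) and predict(data) methods used below
	# (analogous to scikit-learn's fit/predict convention).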
	# "Train" it with data
	print ("Last Parameter: ", argv[-1])
	if argv[-1] != 'True':
		# Before we make the tree we should make sure the data is simple (only a few bins per column)
		# print ("Data before simplify: ", data)
		data = csvSimplifier.simplify(data)
		# print ("Data after simplify: ", data)
	classifier.train(data[train], targets[train])
	# Make "Predictions" on the test data
	predictions = classifier.predict(data[test])
	# Reset correct answers
	correct = 0
	# Count the answers that we got right
	for (prediction, actual) in zip(predictions, test):
		# print ("Prediction: ", prediction)
		# print ("Actual: ", targets[actual])
		if prediction == targets[actual]:
			correct += 1
	# Determine the accuracy of the classifier's predictions (reported as a percentage).
	print("Accuracy: ", correct / test.size * 100)
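	# Equivalently (sketch): sklearn.metrics.accuracy_score(targets[test], predictions)
	# returns the same value as a fraction between 0 and 1.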
	# Create new public repository at GitHub and publish code
	# GitHub repository: http://github.com/DanielRMiller/cs450tree
	return
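
# A minimal entry point is assumed here so the example can be run as a script:
if __name__ == "__main__":
	import sys
	main(sys.argv)
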
# csvFile is assumed to hold the path (or a file-like object) for the CSV data;
# its definition is not shown in this snippet.
csv = numpy.genfromtxt(csvFile, delimiter=",", dtype=str)
numcols = len(csv[0])
data = csv[:, :-1]
targets = csv[:,-1]
# Randomize the order of the instances in the dataset, keeping each target
# matched up with the appropriate instance: applying the same permutation to
# both arrays keeps them aligned.
perm = numpy.random.permutation(len(data))

data = data[perm]
targets = targets[perm]
# Split the shuffled data into two sets: a training set (70%) and a testing set (30%).
# Index of where to split (this needs to be an integer).
index = int(round(len(data) * 0.3))
test = numpy.arange(index)
train = numpy.arange(index, len(data))
# Instantiate your new classifier
classifier = TreeClassifier()
# "Train" it with data
# print ("Test Data: ", data[train])
# The extra boolean argument is a project-specific option of TreeClassifier.train.
classifier.train(data[train], targets[train], True)
# print ("Root: ", classifier.root.attribute)
# Make "Predictions" on the test data
predictions = classifier.predict(data[test])
# Reset correct answers
correct = 0
# Count the answers that we got right
for (prediction, actual) in zip(predictions, test):
	if prediction == targets[actual]:
		correct += 1
# Determine the accuracy of the classifier's predictions (reported as a percentage).
print("Accuracy: ", correct / test.size * 100)