Ejemplo n.º 1
0
def classify(strong_hypothesis, example):
    """Classify *example* by a weighted vote of the boosted learners.

    Args:
        strong_hypothesis: iterable of ``(weight, learner)`` pairs.
        example: the example handed to ``id3.classify`` for every learner.

    Returns:
        1 when the weighted vote is strictly positive, otherwise 0
        (an empty hypothesis or an exact tie therefore yields 0).
    """
    # A learner that answers 1 votes +1; any other answer votes -1.
    weighted_vote = sum(
        weight * (1 if id3.classify(learner, example) == 1 else -1)
        for weight, learner in strong_hypothesis
    )
    if weighted_vote > 0:
        return 1
    return 0
def trial_id3(tree, testing_examples):
    """Run one trial: classify every row of *testing_examples* with *tree*.

    The last element of each row is dropped before the row is passed to
    ``id3.classify`` (presumably it is the label column -- confirm against
    the caller).

    Returns:
        A list holding one ``id3.classify`` result per row, in row order.
    """
    return [id3.classify(tree, row[:-1]) for row in testing_examples]
Ejemplo n.º 3
0
def adaboost(training_data, rounds, num_attributes=57, max_depth=2):
    """Boost depth-limited ID3 trees and return the weighted ensemble.

    Every round draws example indices with ``resample(weights, m)``, fits a
    tree on the resampled examples with ``id3.id3_depth_limited``, measures
    that tree's weighted error on the full ``training_data`` and re-weights
    the examples accordingly.

    Args:
        training_data: sequence of ``(features, label)`` pairs.  For the
            weight update a label/prediction of 0 is treated as -1 and any
            other value as +1.
        rounds: maximum number of boosting rounds.
        num_attributes: attribute indices ``0 .. num_attributes - 1`` are
            offered to the tree learner (57 was hard-coded before).
        max_depth: depth limit handed to ``id3.id3_depth_limited`` (2 was
            hard-coded before).

    Returns:
        A list of ``(alpha, learner)`` pairs, one per accepted round, in
        training order.  Boosting stops early, without recording the
        learner, as soon as a round's weighted error exceeds 0.5.
    """
    m = len(training_data)
    weights = np.ones(m) * 1.0 / m
    alphas = []
    learners = []
    attributes = set(range(num_attributes))

    for _ in range(rounds):
        examples_index = resample(weights, m)
        resampled_examples = [training_data[examples_index[i]]
                              for i in range(m)]

        weak_learner = id3.id3_depth_limited(resampled_examples, attributes,
                                             max_depth)

        # The error is measured on the original examples (not the resample)
        # so that position i lines up with weights[i].
        classifications = [(id3.classify(weak_learner, X), y)
                           for X, y in training_data]
        error = sum(
            (weight
             for (predicted, actual), weight in zip(classifications, weights)
             if predicted != actual),
            0.0,
        )
        # Single-argument print keeps the output identical under the
        # Python 2 print statement and the Python 3 print function.
        print('Error %s' % (error,))

        if error == 0.0:
            # log((1 - 0) / 0) is undefined; use a fixed large vote instead.
            alpha = 4.0
        elif error > 0.5:
            break
        else:
            alpha = 0.5 * np.log((1 - error) / error)

        # Record the learner exactly once, and only after it was accepted,
        # so alphas[k] always belongs to learners[k].
        alphas.append(alpha)
        learners.append(weak_learner)

        reweighted = []
        for (h, y), weight in zip(classifications, weights):
            h = -1 if h == 0 else 1
            y = -1 if y == 0 else 1
            # h * y is +1 for a correct prediction (weight shrinks) and -1
            # for a mistake (weight grows).
            reweighted.append(weight * np.exp(-alpha * h * y))
        sum_weights = sum(reweighted)
        print('Sum of weights %s' % (sum_weights,))
        weights = [float(w) / sum_weights for w in reweighted]

    # A real list (not a one-shot iterator) so the result can be traversed
    # once per classified example.
    return list(zip(alphas, learners))
Ejemplo n.º 4
0
def ensemble(Xtest, ytest, trees):
    """Return the majority-vote error rate of *trees* on a test set.

    Args:
        Xtest: array-like exposing ``shape[0]`` and integer row indexing.
        ytest: expected labels, indexable by row position.
        trees: sequence of trees accepted by ``id3.classify``.

    Returns:
        The fraction of rows whose majority-vote label differs from
        ``ytest`` as a float; 0.0 when ``Xtest`` has no rows.

    Raises:
        ValueError: if there are rows to classify but ``trees`` is empty.
    """
    num_examples = Xtest.shape[0]
    if num_examples == 0:
        # Nothing to misclassify; avoids dividing by zero below.
        return 0.0
    if len(trees) == 0:
        raise ValueError("ensemble() needs at least one tree to vote")

    ensemble_pred = []
    for row in range(num_examples):
        example = Xtest[row]
        votes = {}
        for tree in trees:
            label = id3.classify(tree, example)
            votes[label] = votes.get(label, 0) + 1
        # Ties go to whichever top-voted label max() meets first in the
        # dict's iteration order.
        ensemble_pred.append(max(votes, key=votes.get))

    errors = 0
    for row, predicted in enumerate(ensemble_pred):
        if predicted != ytest[row]:
            errors += 1
    return float(errors) / float(num_examples)
Ejemplo n.º 5
0
def ensemble(Xtest, ytest, trees):
    """Return the majority-vote error rate of *trees* on a test set.

    Args:
        Xtest: array-like exposing ``shape[0]`` and integer row indexing.
        ytest: expected labels, indexable by row position.
        trees: sequence of trees accepted by ``id3.classify``.

    Returns:
        The fraction of rows whose majority-vote label differs from
        ``ytest`` as a float; 0.0 when ``Xtest`` has no rows.

    Raises:
        ValueError: if there are rows to classify but ``trees`` is empty.
    """
    num_examples = Xtest.shape[0]
    if num_examples == 0:
        # Nothing to misclassify; avoids dividing by zero below.
        return 0.0
    if len(trees) == 0:
        raise ValueError("ensemble() needs at least one tree to vote")

    ensemble_pred = []
    for row in range(num_examples):
        example = Xtest[row]
        votes = {}
        for tree in trees:
            label = id3.classify(tree, example)
            votes[label] = votes.get(label, 0) + 1
        # Ties go to whichever top-voted label max() meets first in the
        # dict's iteration order.
        ensemble_pred.append(max(votes, key=votes.get))

    errors = 0
    for row, predicted in enumerate(ensemble_pred):
        if predicted != ytest[row]:
            errors += 1
    return float(errors) / float(num_examples)
Ejemplo n.º 6
0
	# Fragment of a larger routine (its header is outside this view): train an
	# ensemble, tally per-example votes on Xtest, then write a CSV of results.
	# n comes from the fourth command-line argument when present, else 100;
	# it is passed as the third argument to adaboost().
	if len(sys.argv) > 3:
		n = int(sys.argv[3])
	else:
		n = 100

	print 'Learning...'
	forest = adaboost(Xtrain, ytrain, n, len(Xtrain), 57)
	print 'Classifying...'
	# predictions[i] maps each predicted label to its vote count for row i.
	predictions = []

	# NOTE(review): nothing in the visible code appends to ensemble_pred, so
	# the writer loop below would emit only the header row -- confirm whether
	# a "pick the most-voted label" step is missing here.
	ensemble_pred = []
	for i in xrange(Xtest.shape[0]):
		example = Xtest[i]
		predictions.append({})
		# NOTE(review): `trees` is not assigned in the visible code, while the
		# trained ensemble is bound to `forest` above -- confirm which name
		# is intended.
		for j in xrange(len(trees)):
			curr_pred = id3.classify(trees[j],example)
			if curr_pred not in predictions[i]:
				predictions[i][curr_pred] = 1
			else:
				predictions[i][curr_pred] += 1

	print 'Writing to', output_file
	with open(output_file, 'wb') as csvfile:
		filewriter = csv.writer(csvfile, delimiter=' ',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
		# The adjacent literals '|' 'Category' concatenate to '|Category', so
		# the header row has two fields while each data row below has three.
		filewriter.writerow(['Id', '|' 'Category'])
		# Ids written to the file are 1-based.
		for index, classification in enumerate(ensemble_pred):
			filewriter.writerow([index+1, '|', classification])
	print 'Done writing to', output_file

Ejemplo n.º 7
0
 def classify(self,data):
     """Classify *data* with ``self.tree`` via ``id3.classify``.

     Returns None when ``self.dataSet`` or ``self.labels`` is falsy;
     otherwise returns ``id3.classify(self.tree, self.labels, data)``.
     """
     if not self.dataSet or not self.labels:
         return None
     return id3.classify(self.tree, self.labels, data)
Ejemplo n.º 8
0
        '../data/data-splits/data.test', n_features=n_features, preprocessor=preprocessor)

    # Fragment of a larger routine (its header is outside this view): choose
    # the ID3 depth limit by 5-fold cross-validation, then retrain on all of
    # the training data and report/write the results.
    # Labels are stacked as the last column so each fold carries its labels.
    cv_data = np.array_split(np.hstack((train_data, train_labels)), 5)

    max_acc = 0
    opt_depth = 0
    # Candidate depth limits are 2 .. n_features + 1 inclusive.
    for i in range(2, n_features + 2):
        acc = []

        for j in range(len(cv_data)):
            # Fold j is held out; the remaining folds form the training set.
            cv_test = cv_data[j]
            cv_train = np.vstack(cv_data[:j] + cv_data[j + 1:])

            # NOTE(review): labels are passed here as a 1-D column slice,
            # whereas the final fit below passes train_labels as given --
            # confirm id3() accepts both forms.
            tree, depth = id3(cv_train[:, :-1], cv_train[:, -1], max_depth=i)

            cv_acc = evaluate_tree(cv_test[:, :-1], cv_test[:, -1], tree)
            acc.append(cv_acc)

        avg_acc = np.mean(acc)
        # Strict '>' keeps the smallest depth when several depths tie.
        if avg_acc > max_acc:
            opt_depth = i
            max_acc = avg_acc

    # NOTE(review): opt_depth stays 0 if no candidate scores above 0 --
    # confirm how id3() treats max_depth=0.
    tree, depth = id3(train_data, train_labels, max_depth=opt_depth)
    train_acc = evaluate_tree(train_data, train_labels, tree)
    test_acc = evaluate_tree(test_data, test_labels, tree)

    # max_acc is the best mean cross-validation accuracy found above.
    write_output('ID3', opt_depth, max_acc, train_acc, test_acc)
    write_predictions('id3', lambda row: classify(row, tree),
                      n_features=n_features, preprocessor=preprocessor)
Ejemplo n.º 9
0
    # Fragment of a larger routine (its header is outside this view): train an
    # ensemble, tally per-example votes on Xtest, then write a CSV of results.
    # n comes from the fourth command-line argument when present, else 100;
    # it is passed as the third argument to adaboost().
    if len(sys.argv) > 3:
        n = int(sys.argv[3])
    else:
        n = 100

    print 'Learning...'
    forest = adaboost(Xtrain, ytrain, n, len(Xtrain), 57)
    print 'Classifying...'
    # predictions[i] maps each predicted label to its vote count for row i.
    predictions = []

    # NOTE(review): nothing in the visible code appends to ensemble_pred, so
    # the writer loop below would emit only the header row -- confirm whether
    # a "pick the most-voted label" step is missing here.
    ensemble_pred = []
    for i in xrange(Xtest.shape[0]):
        example = Xtest[i]
        predictions.append({})
        # NOTE(review): `trees` is not assigned in the visible code, while the
        # trained ensemble is bound to `forest` above -- confirm which name
        # is intended.
        for j in xrange(len(trees)):
            curr_pred = id3.classify(trees[j], example)
            if curr_pred not in predictions[i]:
                predictions[i][curr_pred] = 1
            else:
                predictions[i][curr_pred] += 1

    print 'Writing to', output_file
    with open(output_file, 'wb') as csvfile:
        filewriter = csv.writer(csvfile,
                                delimiter=' ',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        # The adjacent literals '|' 'Category' concatenate to '|Category', so
        # the header row has two fields while each data row below has three.
        filewriter.writerow(['Id', '|' 'Category'])
        # Ids written to the file are 1-based.
        for index, classification in enumerate(ensemble_pred):
            filewriter.writerow([index + 1, '|', classification])
    print 'Done writing to', output_file
Ejemplo n.º 10
0
	# Fragment of a cross-validation loop body (the loop header is outside
	# this view).  Each section adds this fold's error rate, scaled by 1/k,
	# to one slot of `error`: [0] random forest, [1] decision tree,
	# [2] AdaBoost.
	# NOTE(review): test_examples is not used in the visible code.
	test_examples = zip(test_block, test_label_block.T.tolist()[0])

	# Cross-validate the random forest: raise a 100-tree forest on the
	# training block and score it on the test block.
	print "Cross Validating Random Forest..."
	train_size = int(training_block.shape[0])
	att_size = int(len(attributes))
	forest_size = 100
	[ensemble_error, ensemble_pred] = rf.ensemble(test_block, test_label_block, rf.raise_forest(training_block,training_label_block, forest_size, train_size, att_size))
	error[0]+= (1.0/k) * ensemble_error

	# Cross-validate a single ID3 decision tree by counting misclassified
	# test rows.
	print "Cross Validating Decision Tree..."
	dec_tree = id3.id3(train_examples, attributes)
	dec_tree_errors = 0
	for i in xrange(len(test_block)):
		if id3.classify(dec_tree, test_block[i]) != test_label_block[i]:
			dec_tree_errors += 1
	# NOTE(review): the count is divided by set_size rather than
	# len(test_block) -- confirm the two are equal for every fold.
	error[1] += (1.0/k) * (float(dec_tree_errors) / set_size)

	# Cross-validate AdaBoost the same way as the decision tree.
	print "Cross Validating AdaBoost..."
	adaboost_classifier = adaboost.adaboost(train_examples, adaboost_rounds)
	adaboost_errors = 0
	for i in xrange(len(test_block)):
		if adaboost.classify(adaboost_classifier, test_block[i]) != test_label_block[i]:
			adaboost_errors += 1
	error[2] += (1.0/k) * (float(adaboost_errors) / set_size)



# Report accuracies as 1 - accumulated error.
# NOTE(review): error[2] (AdaBoost) is accumulated above but is not printed
# in the visible code.
print (1-error[0]), (1-error[1])
print 'Estimated accuracy of Random Forest:', (1-error[0])
Ejemplo n.º 11
0
# Script fragment (data, tests, attrs and label are set up outside this
# view).  Move 20% of the records, popped from the end of `data`, into
# `tests` as a hold-out sample.
for _ in range(int(len(data) * 0.2)):
    tests.append(data.pop())

# Build the tree from the remaining records; sys.argv[2] is passed as the
# third argument and is also the record key read as the true value below.
tree = create_tree(data, attrs, sys.argv[2])

# Print the tree.
print_tree(tree)

# Classify every held-out record and compare with its actual value.
print '\nTesting sampled records:'
good = 0.0
bads = []
for s in tests:
    try:
        r = classify(tree, [s])[0]
        rx = s[sys.argv[2]]
        # Mismatches are flagged with '[!]' in the printed line.
        valid = '[!]' if r != rx else ''
        print '{:4}'.format(s[label]), 'classified as:', r, 'actually is', \
              rx, valid
        if r == rx:
            good += 1.0
    except KeyError:
        # A KeyError raised anywhere in the try body marks the record as
        # unclassifiable; it still counts in the len(tests) denominator.
        bads.append(s[label])

print '--'
if bads:
    # NOTE(review): ', '.join requires every s[label] to be a string --
    # confirm the label column's type.
    print 'Could not classify the following:', ', '.join(bads)
# Divides by len(tests), so this raises ZeroDivisionError when no records
# were held out.
print 'Total accuracy: {:.2f}%, {}/{}'.format(100 * good / len(tests),
                                              int(good), len(tests))
 def predictor(row):
     """Return ``mode(...)[0][0]`` over every tree's classification of *row*.

     Presumably this is the majority label across ``trees``; ``mode`` is
     defined outside this view, so confirm its return layout.
     """
     votes = [classify(row, tree) for tree in trees]
     return mode(votes)[0][0]