def cross_validate(fold, data, prune_level=2):
    """Estimate decision-tree error with k-fold cross-validation.

    ``data`` is shuffled in place and cut into ``fold`` contiguous chunks of
    near-equal size.  Each chunk in turn is held out as the test set while a
    tree is grown with ``gen_tree`` from the samples of all other chunks.

    Args:
        fold: Number of folds; must be an integer >= 1.  With a single fold
            the training set is empty, so 2 or more is the useful range.
        data: Mutable sequence of samples in the form ``gen_tree`` and
            ``test_accuracy`` accept.  NOTE: it is shuffled in place.
        prune_level: Value handed to ``test_accuracy`` as its third argument
            for the second measurement.  Defaults to 2, the value that was
            previously hard-coded.

    Returns:
        A tuple ``(err, err_prune)`` of two lists with one entry per fold:
        element ``[1]`` of ``test_accuracy(root, test_data)`` and of
        ``test_accuracy(root, test_data, prune_level)`` respectively.

    Raises:
        ValueError: If ``fold`` is smaller than 1.
    """
    if fold < 1:
        raise ValueError("fold must be at least 1, got {0!r}".format(fold))

    def make_chunks(seq, num_chunks):
        # Integer boundaries guarantee exactly num_chunks slices.  A running
        # float sum can drift (e.g. 3 items in 10 chunks) and yield an extra
        # or misplaced slice.
        size = len(seq)
        return [seq[k * size // num_chunks:(k + 1) * size // num_chunks]
                for k in range(num_chunks)]

    random.shuffle(data)
    chunks = make_chunks(data, fold)

    err = []
    err_prune = []
    for i in range(fold):
        test_data = chunks[i]

        # Flatten every chunk except the held-out one into a single sample
        # list; the tree must never see the fold it is evaluated on.
        train_data = []
        for j, chunk in enumerate(chunks):
            if j != i:
                train_data.extend(chunk)

        root = gen_tree(train_data)
        err.append(test_accuracy(root, test_data)[1])
        err_prune.append(test_accuracy(root, test_data, prune_level)[1])

    return (err, err_prune)
def validation_error(test_data, train_data_orig, validate_ratio):
    """Return the test error of the tree at the best validated pruning level.

    ``train_data_orig`` is shuffled in place.  Its first
    ``int(len(train_data_orig) * validate_ratio)`` samples become the
    validation set and the remainder is used to grow the tree with
    ``gen_tree``.  Every level in ``range(max_depth(tree))`` is scored on the
    validation set through ``test_accuracy(tree, validation, level)[1]``.
    The first level with the strictly lowest score is then evaluated on
    ``test_data``.

    Args:
        test_data: Samples used for the final measurement.
        train_data_orig: Mutable sequence of samples; shuffled in place and
            then split into validation and training parts.
        validate_ratio: Fraction of ``train_data_orig`` set aside for
            validation.

    Returns:
        Element ``[1]`` of ``test_accuracy(tree, test_data, level)`` for the
        selected level.  If no level scores below infinity (for instance
        when ``max_depth`` returns 0) the level passed on is ``-1``.
    """
    random.shuffle(train_data_orig)

    # Carve the validation samples off the front of the shuffled data.
    cut = int(len(train_data_orig) * validate_ratio)
    held_out, fit_rows = train_data_orig[:cut], train_data_orig[cut:]

    # Tree grown from the non-validation samples only.
    tree = gen_tree(fit_rows)

    # Scan the pruning levels, keeping the first strictly-best one.
    chosen_level = -1
    lowest = float("inf")
    for level in range(max_depth(tree)):
        candidate = test_accuracy(tree, held_out, level)[1]
        if not candidate < lowest:
            continue
        lowest, chosen_level = candidate, level

    # Score the selected pruning level against the test data.
    return test_accuracy(tree, test_data, chosen_level)[1]
best_prune = -1 for i in range(max_depth(root)): err = test_accuracy(root, validate_data, i)[1] if err < least_err: least_err = err best_prune = i # Return err of best pruning against test data return test_accuracy(root, test_data, best_prune)[1] training_data = load_training("wifi.train") test_data = load_training("wifi.test") root = gen_tree(training_data) print "\n" err_locs, err_all = test_accuracy(root, training_data) print "TEST ACCURACY ON TRAINING SET:" print "\tLOCATION SPECIFIC ERRORS:" for loc, val in err_locs.iteritems(): print "\t\t{0} : {1}".format(loc,val) print "\tOVERALL ERROR:\n\t\t{0}".format(err_all) print "\n" err_locs, err_all = test_accuracy(root, test_data) print "TEST ACCURACY ON TEST SET:" print "\tLOCATION SPECIFIC ERRORS:"