def main():
    """Train and evaluate ID3 decision trees on the 'Cars' dataset.

    For each split criterion (gini / entropy / majority error) and each
    depth limit 1..6, fits a tree on the training split and prints the
    train and test accuracy.
    """
    train_examples, train_rows = makeData(training, labels)
    test_examples, test_rows = makeData(testing, labels)
    # Gold labels are stored as the last field of each raw row.
    train_gold = [row[-1] for row in train_rows]
    test_gold = [row[-1] for row in test_rows]

    print("Running the decision tree algorithm on the 'Cars' dataset.")
    for criterion in ('gini', 'entropy', 'ME'):
        for depth in range(1, 7):
            model = id3.id3(train_examples, labels, label_attr, labels[-1],
                            depth, criterion, None)
            guesses_train = [id3.predict(model, ex, labels)
                             for ex in train_examples]
            guesses_test = [id3.predict(model, ex, labels)
                            for ex in test_examples]
            acc_train = id3.accuracy(guesses_train, train_gold)
            acc_test = id3.accuracy(guesses_test, test_gold)
            print("Decision tree of depth", depth, "using", criterion,
                  "has a test accuracy of", acc_test,
                  'and a training accuracy of', acc_train)
def main():
    """Fit an ID3 decision tree on the 'close1' dataset and report accuracy,
    before and after filtering training anomalies with kNN.
    """
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    train_set = pd.read_csv('data/close1.csv', sep=';')
    test_set = pd.read_csv('data/close1_test2.csv', sep=';')
    # Fixed: `print x` is Python 2 statement syntax and is a SyntaxError
    # under Python 3; the rest of this file uses print() calls.
    print(train_set.to_html(classes='table'))
    print(test_set.to_html(classes='table'))

    tree = {}
    target_field = 'Доступность'
    fields = list(train_set.columns.values)
    fields.remove(target_field)

    id3.fit(train_set, tree, target_field, fields)
    print('Train Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, train_set, target_field)))
    print('Test Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, test_set, target_field)))
    # plot_tree(tree, u'Decision Tree')

    # Drop anomalous training rows with kNN, then refit.
    filtered_ts = knn.filter_anomalies(train_set, 'Тип объекта', 3, 0.15)
    # Fixed: start from a fresh dict so the second fit cannot be
    # contaminated by nodes left over from the first tree.
    tree = {}
    id3.fit(filtered_ts, tree, target_field, fields)
    print('Train Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, filtered_ts, target_field)))
    print('Test Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, test_set, target_field)))
def main():
    """Run AdaBoost over decision stumps on the bank dataset.

    Reports train/test accuracy for several boosting iteration counts,
    then the per-stump accuracies from a 100-round run.
    """
    trainset, raw_train = makeData(training, labels)
    testset, raw_test = makeData(testing, labels)
    # Binarize numeric attributes against the training-set medians.
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)

    for example in trainset:
        example['weight'] = 1 / len(trainset)

    gold_train = [row[-1] for row in raw_train]
    gold_test = [row[-1] for row in raw_test]

    train = []
    test = []
    for rounds in [1, 2, 4, 8, 16, 20, 21]:
        # Reset example weights to uniform before every boosting run.
        for example in trainset:
            example['weight'] = 1 / len(trainset)
        ensemble = boost(trainset, labels, label_attr, labels[-1],
                         rounds, gold_train)
        guesses_train = [boostGuess(ensemble, ex, labels, label_attr['outcome'])
                         for ex in trainset]
        guesses_test = [boostGuess(ensemble, ex, labels, label_attr['outcome'])
                        for ex in testset]
        acc_train = accuracy(guesses_train, gold_train)
        # Mirror-flip near-zero accuracies (complementary label ordering).
        if acc_train < 0.3:
            acc_train = 1 - acc_train
        acc_test = accuracy(guesses_test, gold_test)
        if acc_test < 0.3:
            acc_test = 1 - acc_test
        train.append(acc_train)
        test.append(acc_test)
        print("Boosted decision tree with", rounds,
              'iterations, has a training accuracy of', acc_train)
        print("Boosted decision tree with", rounds,
              'iterations, has a testing accuracy of', acc_test)

    hundred_rounds = boost(trainset, labels, label_attr, labels[-1],
                           100, gold_train)
    stump_train_accs = []
    stump_test_accs = []
    for weighted_stump in hundred_rounds:
        stump = weighted_stump['tree']
        preds_train = [predict(stump, ex, labels) for ex in trainset]
        preds_test = [predict(stump, ex, labels) for ex in testset]
        stump_train_accs.append(accuracy(preds_train, gold_train))
        stump_test_accs.append(accuracy(preds_test, gold_test))
    print('Training stumps', stump_train_accs)
    print('Testing stumps', stump_test_accs)
def _depth_sweep(trainset, testset, trainLabels, testLabels):
    """Train ID3 trees for every criterion and depth 1..16 on the given
    splits and print test/train accuracy for each (shared by both the
    raw-unknowns and replaced-unknowns experiments)."""
    for item in ['gini', 'entropy', 'ME']:
        for i in range(1, 17):
            currentTree = id3(trainset, labels, label_attr, labels[-1],
                              i, item, None)
            trainPred = [predict(currentTree, x, labels) for x in trainset]
            testPred = [predict(currentTree, x, labels) for x in testset]
            trainAcc = accuracy(trainPred, trainLabels)
            testAcc = accuracy(testPred, testLabels)
            print("Decision tree of depth", i, "using", item,
                  "has a test accuracy of", testAcc,
                  'and a training accuracy of', trainAcc)


def main():
    """Evaluate ID3 on the bank dataset twice: once treating 'unknown' as a
    regular attribute value, once with unknowns replaced by the most common
    value (via fixUnknown/replaceUnknown).
    """
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    # Binarize numeric attributes against the training-set medians.
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)
    # Build the unknown-replaced variants of both splits.
    fix = fixUnknown(trainset, labels)
    trainsetU = replaceUnknown(trainset, labels, fix)
    testsetU = replaceUnknown(testset, labels, fix)
    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]

    print(
        "Running decision tree algorithm on the bank dataset with unknown values"
    )
    # Refactored: the two identical criterion/depth sweeps now share one helper.
    _depth_sweep(trainset, testset, trainLabels, testLabels)

    print(
        "Running decision tree algorithm on the bank dataset with unknown's replaced"
    )
    print("\n \n \n \n \n")
    _depth_sweep(trainsetU, testsetU, trainLabels, testLabels)
def main():
    """Fit an ID3 decision tree on the 'close1' dataset and report accuracy,
    before and after filtering training anomalies with kNN.
    """
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    train_set = pd.read_csv('data/close1.csv', sep=';')
    test_set = pd.read_csv('data/close1_test2.csv', sep=';')
    # Fixed: `print x` is Python 2 statement syntax and is a SyntaxError
    # under Python 3; the rest of this file uses print() calls.
    print(train_set.to_html(classes='table'))
    print(test_set.to_html(classes='table'))

    tree = {}
    target_field = 'Доступность'
    fields = list(train_set.columns.values)
    fields.remove(target_field)

    id3.fit(train_set, tree, target_field, fields)
    print('Train Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, train_set, target_field)))
    print('Test Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, test_set, target_field)))
    # plot_tree(tree, u'Decision Tree')

    # Drop anomalous training rows with kNN, then refit.
    filtered_ts = knn.filter_anomalies(train_set, 'Тип объекта', 3, 0.15)
    # Fixed: start from a fresh dict so the second fit cannot be
    # contaminated by nodes left over from the first tree.
    tree = {}
    id3.fit(filtered_ts, tree, target_field, fields)
    print('Train Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, filtered_ts, target_field)))
    print('Test Set Accuracy: {} ({}/{})'.format(
        *id3.accuracy(tree, test_set, target_field)))
def main_slow():
    """Bagged-tree experiments on the bank dataset.

    Phase 1: for attribute-subset sizes k in {2, 4, 6}, train bagged random
    forests of increasing size and print train/test accuracy curves.
    Phase 2: train 100 bags of 1000 trees each, then estimate the bias and
    variance of (a) single trees and (b) bagged predictors on the test set,
    and print the decomposed squared error for both.
    Named "_slow" because phase 2 trains 100,000 trees.
    """
    trainset, trainraw = makeData(training, labels)
    testset, testraw = makeData(testing, labels)
    # Binarize numeric attributes against the training-set medians.
    medians = medianAssign(trainset, labels)
    trainset = removeNums(trainset, medians)
    testset = removeNums(testset, medians)
    # Uniform example weights (unweighted bagging, unlike the boosting driver).
    for element in trainset:
        element['weight'] = 1
    # Gold labels are the last field of each raw row.
    trainLabels = [item[-1] for item in trainraw]
    testLabels = [item[-1] for item in testraw]
    treenums = [1, 100, 200, 300, 500, 800, 1000]
    for k_val in [2, 4, 6]:
        print("When you set your attribute subset values to", k_val,
              "you get the following.")
        trainAcc = []
        testAcc = []
        for num in treenums:
            treelist = []
            for i in range(num):
                # Bootstrap sample (with replacement) of the training set.
                newTraining = rand.choices(trainset, k=len(trainset))
                # rand_id3 presumably restricts each split to k_val random
                # attributes; depth cap 18 — TODO confirm against its def.
                newTree = rand_id3(newTraining, labels, label_attr,
                                   labels[-1], 18, 'entropy', None, k_val)
                treelist.append(newTree)
            trainPred = []
            testPred = []
            for entry in trainset:
                thing = bag_guess(treelist, entry, labels,
                                  label_attr['outcome'])
                trainPred.append(thing)
            for entry in testset:
                thing = bag_guess(treelist, entry, labels,
                                  label_attr["outcome"])
                testPred.append(thing)
            trainAcc.append(accuracy(trainPred, trainLabels))
            testAcc.append(accuracy(testPred, testLabels))
        print(trainAcc, 'train accuracy')
        print(testAcc, 'test accuracy')
    # ---- Phase 2: bias/variance decomposition ----
    tree_preds = []  # 100 bags of 1000 trees each
    basics = []      # first tree of each bag (the "single tree" predictors)
    for i in range(100):
        train_i = rand.choices(trainset, k=1000)
        treelist_i = []
        for j in range(1000):
            train_j = rand.choices(train_i, k=1000)
            newTree = rand_id3(train_j, labels, label_attr, labels[-1], 18,
                               'entropy', None, 6)
            treelist_i.append(newTree)
            # if j%100 == 0:
            #     print("100 more trees from set", i,
            #           'have been trained. iteration = ', j)
        tree_preds.append(treelist_i)
        basics.append(treelist_i[0])
        print("Tree set", i, "has been trained")
    # Per-test-example statistics for the single-tree predictors.
    singleVar = []
    singleBias = []
    singleMean = []
    for entry in testset:
        guess_agg = 0
        predictions = []  # 1 if a tree predicts the first outcome label, else 0
        for tree in basics:
            guess = predict(tree, entry, labels)
            if guess == label_attr['outcome'][0]:
                guess_agg += 1
                predictions.append(1)
            else:
                predictions.append(0)
        ave = guess_agg / len(basics)  # mean prediction across the 100 trees
        singleMean.append(ave)
        # Ground truth encoded the same way: 1 iff the first outcome label.
        value = 0
        if entry['outcome'] == label_attr['outcome'][0]:
            value = 1
        bias = (value - ave)**2
        singleBias.append(bias)
        # Sample variance of the 0/1 predictions (n-1 denominator).
        subVar = []
        for h in predictions:
            mini = (h - ave)**2
            subVar.append(mini)
        var = (1 / (len(basics) - 1)) * sum(subVar)
        singleVar.append(var)
    # Same statistics for the bagged predictors (whole 1000-tree bags).
    bagVar = []
    bagBias = []
    bagMean = []
    for entry in testset:
        guess_agg = 0
        predictions = []
        for trees in tree_preds:
            guess = bag_guess(trees, entry, labels, label_attr['outcome'])
            if guess == label_attr['outcome'][0]:
                guess_agg += 1
                predictions.append(1)
            else:
                predictions.append(0)
        # len(basics) == len(tree_preds) == 100, so this denominator matches.
        ave = guess_agg / len(basics)
        bagMean.append(ave)
        value = 0
        if entry['outcome'] == label_attr['outcome'][0]:
            value = 1
        bias = (value - ave)**2
        bagBias.append(bias)
        subVar = []
        for h in predictions:
            mini = (h - ave)**2
            subVar.append(mini)
        var = (1 / (len(basics) - 1)) * sum(subVar)
        bagVar.append(var)
    # Average over the test set and report bias + variance = squared error.
    sVariance = mean(singleVar)
    sBias = mean(singleBias)
    sMSE = sBias + sVariance
    print("The bias and the variance of the single trees are: Variance:",
          sVariance, 'Bias:', sBias,
          "and the general squared error is:", sMSE)
    bVariance = mean(bagVar)
    bBias = mean(bagBias)
    bMSE = bBias + bVariance
    print("The bias and the variance of the bagged trees are: Variance:",
          bVariance, 'Bias:', bBias,
          "and the general squared error is:", bMSE)
# Learning-curve experiment: grow the training set in 10 steps and compare
# pruned vs. unpruned ID3 accuracy, averaged over 5 experiment folds.
# NOTE(review): `upper_limit`, `validation_set`, `default`, and
# `outputfile_tr` are defined outside this fragment — confirm before reuse.
step_size = len(trainset[0]) // 10
for length in range(10, upper_limit, step_size):
    print('Number of Training Instances:', length)
    outputfile_tr.write('Number of Training Instances: ' + str(length) + '\n')
    pruned_accuracies = []
    unpruned_accuracies = []
    for experiment in range(5):
        # Use only the first `length` instances of this fold's training split.
        train = trainset[experiment][:length]
        test = testset[experiment]
        # Pruned tree: fit, prune against the validation set, then score.
        tree = id3.ID3(train, default)
        id3.prune(tree, validation_set)
        acc = id3.accuracy(tree, test)
        pruned_accuracies.append(acc)
        # Unpruned baseline: refit from scratch on the same data.
        tree = id3.ID3(train, default)
        acc = id3.accuracy(tree, test)
        unpruned_accuracies.append(acc)
    avg_pruned_accuracies = sum(pruned_accuracies) / len(pruned_accuracies)
    avg_unpruned_accuracies = sum(unpruned_accuracies) / len(
        unpruned_accuracies)
    print('  Accuracy for Pruned tree: ' + str(avg_pruned_accuracies))
    print('Accuracy for Unpruned tree: ' + str(avg_unpruned_accuracies))
    outputfile_tr.write('  Accuracy for Pruned tree: ' +
                        str(avg_pruned_accuracies) + '\n')