Example #1
def main():
    trainSetFilename, testSetFilename, m = getArgs()
    trainData = arffParser.parse(trainSetFilename)
    testData = arffParser.parse(testSetFilename)

    data = trainData['data']
    attr = trainData['attributes']
    targetIndex = len(trainData['attributes']) - 1
    tree = DecisionTree(data, attr, targetIndex, m)

    if tree.root is None:
        return
    tree.printTree()
    print('<Predictions for the Test Set Instances>')
    predictedData = []
    for row in testData['data']:
        predictedClass = tree.classify(row, testData['attributes'])
        predictedRow = row[-1:] + [predictedClass]
        predictedData.append(predictedRow)

    numCorrect = 0
    for i, row in enumerate(predictedData, start=1):
        print('{0}: Actual: {1} Predicted: {2}'.format(i, row[0], row[1]))
        if row[0] == row[1]:
            numCorrect += 1
    print('Number of correctly classified: ' + str(numCorrect),
          'Total number of test instances: ' + str(len(predictedData)))
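The arffParser module imported by this example is not shown. Below is a minimal sketch of a parse() function returning the {'attributes': ..., 'data': ...} dict shape the example indexes, assuming a plain ARFF file with nominal attributes only; the real parser may differ.

def parse(filename):
    # Hypothetical stand-in for arffParser.parse (assumption: nominal
    # attributes declared as "@attribute name {v1, v2, ...}").
    attributes, data, in_data = [], [], False
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('%'):
                continue  # skip blank lines and ARFF comments
            if line.lower().startswith('@attribute'):
                name = line.split()[1]
                values = line[line.index('{') + 1:line.rindex('}')]
                attributes.append((name, [v.strip() for v in values.split(',')]))
            elif line.lower().startswith('@data'):
                in_data = True
            elif in_data:
                data.append([v.strip() for v in line.split(',')])
    return {'attributes': attributes, 'data': data}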
Example #2
def main():

    # Check if the number of command line arguments is correct
    if len(sys.argv) < 6:
        print(
            "-- python --|-- main.py --|-- L --|-- K --|-- training_set.csv --|-- validation_set.csv --|-- test_set.csv --|"
        )
        sys.exit(1)

    # The program takes two integer L and K as input to prune the decision tree
    L = int(sys.argv[1])
    K = int(sys.argv[2])

    # Get the file path of the training data, validation data and test data
    dataDir = './data1/'
    training_set = dataDir + sys.argv[3]
    validation_set = dataDir + sys.argv[4]
    test_set = dataDir + sys.argv[5]

    # Build a decision tree on training data
    decisionTree = DecisionTree(training_set)

    #############################################
    # decisionTree.exportTree('tree.txt')
    # print decisionTree
    #############################################

    # Create a validator using test data to calculate the prediction accuracy of a given decision tree
    validator = Validator(test_set)

    # Calculate the prediction accuracy of the original decision tree on test data
    validator.calculateAccuracy(decisionTree.root)

    # Display the prediction accuracy before pruning
    print("\nA decision tree is fully grown to fit the training data.")
    validator.displayAccuracy()

    # Post pruning the decision tree
    print("\nPost prunning", '.' * 30)
    print("L =", L, ", K =", K, ", the pruned decision tree is:\n")

    # Prune the original decision tree using L, K and validation data as inputs
    decisionTree.pruneTree(L, K, validation_set)

    ##############################################
    # decisionTree.exportTree('pruned_tree.txt')
    ##############################################

    # print the decision tree to standard output
    print(decisionTree)  # Override the __str__ method in DecisionTree class

    # Calculate the prediction accuracy of the pruned decision tree on test data
    validator.calculateAccuracy(decisionTree.root)

    # Display the prediction accuracy after pruning
    validator.displayAccuracy()
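Given the argument check at the top of main(), a matching invocation looks like this (L=10 and K=5 are example values; the CSV names come from the usage string and are resolved under ./data1/):

python main.py 10 5 training_set.csv validation_set.csv test_set.csv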
Example #3
    def train(self, data, labels):
        self.data = data
        self.labels = labels
        for i in range(self.num_trees):
            # Bootstrap: draw num_sample row indices with replacement
            sample_index = np.random.choice(self.data.shape[0],
                                            self.num_sample,
                                            replace=True)
            train_data = self.data[sample_index, :]
            train_labels = self.labels[sample_index]
            # Fit one tree per bootstrap sample and keep it for voting
            tree = DecisionTree(self.max_depth, self.num_feature)
            tree.train(train_data, train_labels)
            self.trees.append(tree)
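The matching predict is not part of this example; a minimal sketch that majority-votes across the bagged trees, assuming non-negative integer class labels and that each tree's predict returns one label per row:

    def predict(self, data):
        # Sketch only: majority vote over the bagged trees (assumes
        # non-negative integer labels and one prediction per row).
        votes = np.array([tree.predict(data) for tree in self.trees])
        return np.array([np.bincount(col).argmax() for col in votes.T])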
Example #4
    def test_all_file(self):
        options = {
            'df': pd.read_csv("benchmark.csv", sep=';'),
            'label_column': "Joga"
        }
        tr = DecisionTree()
        model = tr.train(options)

        for _, row in options['df'].iterrows():
            target_label = row["Joga"]
            predicted = model.predict(row.drop("Joga"))
            self.assertEqual(target_label, predicted)
Example #5
    def test_benchmark(self):
        options = {
            'df': pd.read_csv("benchmark.csv", sep=';'),
            'label_column': "Joga"
        }
        tr = DecisionTree()
        model = tr.train(options)

        inf_data = pd.Series(
            ["Ensolarado", "Quente", "Normal", "Verdadeiro"],
            index=["Tempo", "Temperatura", "Umidade", "Ventoso"],
            name="InferenceData")
        self.assertEqual(model.predict(inf_data), 'Sim')
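Both tests above read a semicolon-separated benchmark.csv whose label column is "Joga". From the Series index used in test_benchmark, its header line can be inferred (a hedged reconstruction, not the original file):

Tempo;Temperatura;Umidade;Ventoso;Joga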
Example #6
    def testSimpleCase(self):
        tree = DecisionTree()
        tree.fit(simpleData)

        for datum in simpleData:
            self.assertEqual(datum[-1], tree.predict(datum))

        tree.print()
Example #7
    def fit(self, dataset):
        self.forest = []
        for i in range(self.num_trees):
            # Generate a random subset of the dataset to train the tree on
            subset = [
                dataset[random.randrange(0, len(dataset))]
                for a in range(self.num_samples)
            ]
            self.forest.append(DecisionTree(self.max_depth).fit(subset))

        return self
Example #8
    def train(self, options):
        """
        train a random forest, using n_trees decision trees
        options['df']: pandas dataframe
        options['n_trees']: number of trees
        options['label_column']: label column to be predicted
        options['bootstrap_size']: the size of the bootstrap, entries not used in the bootstrap will be ignored
        """
        num_trees = options['n_trees']
        df = options['df']
        bootstrap_size = options['bootstrap_size']

        tree_options = {
            'label_column': options['label_column']
        }
        for i in range(num_trees):
            tree_options['df'] = get_bootstrap(df, bootstrap_size)
            new_tree = DecisionTree()
            self.ensemble.append(new_tree.train(tree_options))

        return self
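The get_bootstrap helper referenced above is not shown. A minimal sketch using pandas sampling with replacement, matching the docstring's note that rows outside the bootstrap are ignored:

def get_bootstrap(df, bootstrap_size):
    # Sketch of the assumed helper: draw bootstrap_size rows with
    # replacement; undrawn rows are simply left out.
    return df.sample(n=bootstrap_size, replace=True).reset_index(drop=True)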
Example #9
cat_data_test = np.array(cat_data_test).T
cat_data_test = np.array(cat_data_test, dtype='float')

# zip categorical and non-categorical data together

train_data = np.concatenate((cat_data, non_cat_data), axis=1)
train_label = data[:, -1].astype(int)
validation_data = train_data[:200, :]
validation_label = train_label[:200]
# NOTE: the 200 validation rows are not removed from the training set here
# (compare Example #12, which slices them off with train_data[6000:, :])
train_data = train_data[:, :]
train_label = train_label[:]
test_data = np.concatenate((cat_data_test, non_cat_data_test), axis=1)


# decision tree
tree = DecisionTree(5, train_data.shape[0])
tree.train(train_data, train_label)
res = tree.predict(validation_data)
score = 0
for i in range(len(res)):
    if res[i] == validation_label[i]:
        score += 1
score /= len(res)
print(score)


# random forest

forest = RandomForest(100, 5, train_data.shape[0], 6)
forest.train(train_data, train_label)
res = forest.predict(validation_data)
Example #10
    label2id = {'loss': 0, 'draw': 1, 'win': 2}
    p2f = {'b': 0, 'x': 1, 'o': 2}
    for line in fin:
        line = line.strip().split(',')
        label = label2id[line[-1]]
        feature = np.array([p2f[p] for p in line[:-1]])
        data.append((feature, label))
    fin.close()
    return data


data = read('connect-4.data')
x = np.array([d[0] for d in data])
y = np.array([d[1] for d in data])
#y = label_binarize(y, classes=list(range(3)))
kf = KFold(n_splits=5, shuffle=True)  # shuffle must be passed by keyword in current scikit-learn
all_f1 = []
for train_index, test_index in kf.split(x):
    x_train, y_train = x[train_index], y[train_index]
    x_test, y_test = x[test_index], y[test_index]

    #x_train, x_test, y_train, y_test = train_test_split(x, y)
    #print('training')
    #model = OneVsRestClassifier(SVC(kernel='rbf'))
    model = DecisionTree()
    model.fit(x_train, y_train)
    #print('testing')
    y_pred = model.predict(x_test)
    all_f1.append(f1_score(y_test, y_pred, average='macro'))
print(sum(all_f1) / 5)
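If this custom DecisionTree implemented the scikit-learn estimator interface (fit/predict plus get_params), the loop above could be collapsed into one call; a hedged equivalent:

from sklearn.model_selection import KFold, cross_val_score

# Assumes DecisionTree is sklearn-compatible; otherwise keep the manual loop.
scores = cross_val_score(DecisionTree(), x, y,
                         cv=KFold(n_splits=5, shuffle=True),
                         scoring='f1_macro')
print(scores.mean())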
Example #11
__author__ = 'zephyrYin'

from decisionTree import DecisionTree

dT = DecisionTree('data/featnames.csv', 'data/trainfeat.csv', 'data/trainlabs.csv', 'data/testfeat.csv', 'data/testlabs.csv', 0.01)

#dT = DecisionTree('data/weatherFeatureName.csv', 'data/weatherTrainFeature.csv', 'data/weatherTrainLabel.csv', 'data/weatherTrainFeature.csv', 'data/weatherTrainLabel.csv', 1)
dT.buildTree()
dT.predictTestSet()

result = dT.evaluate(dT.contrastResult[0], dT.contrastResult[1])
print(result)
print(str(dT.nodeCnt) + ' nodes')

# dT.readTrainFeatures()
# wholeCnt = len(dT.testFeatures)
# posCnt = dT.countPositive(dT.testFeatures)
# print(wholeCnt)
# print(posCnt)
# print(posCnt/float(wholeCnt))

# dT.readFeatureNames()
# dT.readTrainFeatures()
# dT.readTestFeatures()
#
# for testFea in dT.testFeatures:
#     for i in range(len(testFea)-1):
#         if testFea[i] not in dT.featureValue[i]:
#             print i
#             print(str(testFea[i]) + ' not in ' + str(dT.featureValue[i]))
#
Example #12
cat_data_test = np.array(cat_data_test, dtype='int')


# zip categorical and non-categorical data together
train_data = np.concatenate((cat_data, non_cat_data), axis=1)
train_label = data[:, -1].astype(int)
validation_data = train_data[:6000, :]
validation_label = train_label[:6000]
train_data = train_data[6000:, :]
train_label = train_label[6000:]
test_data = np.concatenate((cat_data_test, non_cat_data_test), axis=1)

# plot accuracy
accuracy = []
for depth in range(40):
    # Depth sweep: train one tree per max-depth value
    # (outer variable renamed so the inner index loop no longer shadows it)
    tree = DecisionTree(depth, 105)
    tree.train(train_data, train_label)
    res = tree.predict(validation_data)
    score = 0
    for i in range(len(res)):
        if res[i] == validation_label[i]:
            score += 1
    score /= len(res)
    accuracy.append(score)

plt.plot(accuracy)
plt.xlabel('depth')
plt.ylabel('accuracy')
plt.title('Accuracy vs. Decision Tree Depth')
plt.savefig('p6.png')
plt.show()
Example #13
    def testComplexCase(self):
        tree = DecisionTree()
        tree.fit(complexData)

        for datum in complexData:
            self.assertEqual(datum[-1], tree.predict(datum))

        # Overcast = YES
        self.assertEqual(
            Label.YES,
            tree.predict(
                [Outlook.Overcast, Temperature.Hot, Humidity.High, Wind.Weak]))
        self.assertEqual(
            Label.YES,
            tree.predict([
                Outlook.Overcast, Temperature.Cool, Humidity.Normal,
                Wind.Strong
            ]))

        # Sunny + Normal = YES
        self.assertEqual(
            Label.YES,
            tree.predict([
                Outlook.Sunny, Temperature.Cool, Humidity.Normal, Wind.Strong
            ]))
        self.assertEqual(
            Label.YES,
            tree.predict(
                [Outlook.Sunny, Temperature.Hot, Humidity.Normal, Wind.Weak]))

        # Sunny + High = NO
        self.assertEqual(
            Label.NO,
            tree.predict(
                [Outlook.Sunny, Temperature.Cool, Humidity.High, Wind.Strong]))
        self.assertEqual(
            Label.NO,
            tree.predict(
                [Outlook.Sunny, Temperature.Hot, Humidity.High, Wind.Weak]))

        # Rain + Weak = YES
        self.assertEqual(
            Label.YES,
            tree.predict(
                [Outlook.Rain, Temperature.Cool, Humidity.Normal, Wind.Weak]))
        self.assertEqual(
            Label.YES,
            tree.predict(
                [Outlook.Rain, Temperature.Hot, Humidity.High, Wind.Weak]))

        # Rain + Strong = NO
        self.assertEqual(
            Label.NO,
            tree.predict(
                [Outlook.Rain, Temperature.Cool, Humidity.Normal,
                 Wind.Strong]))
        self.assertEqual(
            Label.NO,
            tree.predict(
                [Outlook.Rain, Temperature.Hot, Humidity.High, Wind.Strong]))

        tree.print()
Example #14
    def run(self):
        dt = DecisionTree(self.pathname, self.G)
        dt.dtree()
Example #15
    f_score = f1_score(y_test, pred, pos_label='grapefruit')
    # print('Confusion matrix:\n', cm)
    # print('Accuracy: {}\nPrecision: {}\nRecall: {}\nF1_score: {}'.format(
    #     acc, precision, recall, f_score))
    return acc, precision, recall, f_score


# load data
df = pd.read_csv("citrus.csv")

# split columns
X = df.drop("name", axis=1)
y = df["name"].values

# initialize classifiers
clf1 = DecisionTree()
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)
clf4 = GaussianNB()

test_cases = [[clf1], [clf2], [clf3], [clf4], [clf1, clf2, clf3, clf4],
              [clf2, clf3, clf4]]

test_number = 1
for classifiers in test_cases:

    accuracy = []
    precision = []
    recall = []
    f_score = []
Example #16
def test():
    X = [[1, 2, 0, 1, 0],
         [0, 1, 1, 0, 1],
         [1, 0, 0, 0, 1],
         [2, 1, 1, 0, 1],
         [1, 1, 0, 1, 1]]
    y = ['yes', 'yes', 'no', 'no', 'no']

    decision_tree = DecisionTree(mode='C4.5')
    decision_tree.fit(X, y)

    res = decision_tree.predict(X)
    print(res)

    model_name = 'test.dt'
    decision_tree.saveModel(model_name)

    new_tree = DecisionTree()
    new_tree.loadModel(model_name)
    print(new_tree.predict(X))
Example #17
import matplotlib.pyplot as plt
import numpy as np

from loadData import LoadData
from decisionTree import Node, DecisionTree, Evaluate
from inspection import Inspection

if __name__ == '__main__':
    train_input = '../handout/education_train.tsv'
    test_input = '../handout/education_test.tsv'
    train_output = '../result/education_train.labels'
    test_output = '../result/education_test.labels'

    ld = LoadData()
    dataset = ld.load_data(train_input)
    dt = DecisionTree(ld)
    tr_err = []
    te_err = []
    x_arr = []
    print(ld.head)
    for i in range(len(ld.head)):
        root = dt.construct(dataset, i)
        # dt.traverse(root)
        dt.classify(ld.load_data(train_input), root, train_output)
        dt.classify(ld.load_data(test_input), root, test_output)
        with open(train_output, 'r') as f:
            predcol = f.read().splitlines()
        realcol = np.loadtxt(train_input,
                             dtype=str,
                             delimiter='\t',
                             skiprows=1)[:, -1]
Example #18
from decisionTree import DecisionTree
from decisionTreePlot import DecisionTreePlot

if __name__ == "__main__":
    trainingSetFile = open('SampleSets/training_set.csv')
    trainingSetData = trainingSetFile.readlines()
    trainingSetFile.close()
    decisionTree = DecisionTree(trainingSetData)
    # Algorithm type (pick one of the variants below)
    decisionTree.C45()
    #decisionTree.ID3()
    #decisionTree.ID3K(2)
    #decisionTree.SID3()
    #decisionTree.LSID3(3)
    #decisionTree.LSID3PathSample(2)
    #decisionTree.LSID3MC(1, 0.1)
    #decisionTree.BLSID3(1)
    #decisionTree.BLSID3PathSample(1)
    #decisionTree.LSID3Sequenced(2)
    #decisionTree.IIDT(10, 0.5)

    print("****Tree Data BEFORE Pruning****")
    print("Tree Size - Number of Nodes:", decisionTree.size())
    print("Number of Leafs:", decisionTree.getNumLeafs())
    print("Tree Depth:", decisionTree.getTreeDepth())
    testSetFile = open('SampleSets/test_set.csv')
    testSetData = testSetFile.readlines()
    testSetFile.close()
    print("Prediction:", str(decisionTree.predict(testSetData) * 100) + "%")
    #decisionTreePlot = DecisionTreePlot()
    #decisionTreePlot.createDecisionTreePlot(decisionTree)
Example #19
from fileIO import FileIO
from preprocess import Preprocessing
from decisionTree import DecisionTree


if __name__ == '__main__':
    filename = 'house-votes-84.data.txt'
    fileio = FileIO()
    data = fileio.read_csv(filename)

    preprocessing = Preprocessing()
    preprocessing.assume_missing_values(data)
    for percent in range(3, 8):
        training_data, testing_data = preprocessing.split_into_training_and_testing(data, percent/float(10))
        attributes_number = len(training_data[0]) - 1
        decision_tree = DecisionTree()
        root_node = decision_tree.build(training_data)
        # decision_tree.print()
        # print("Classification: ")
        accuracy = 0
        for row in testing_data:
            classified = decision_tree.classify(row, decision_tree.root)
            classified.calc_percentages(len(testing_data))
            if ((classified.republicans_percent > 50.0 and row[0] == 'republican')
                    or (classified.democrats_percent > 50.0 and row[0] == 'democrat')):
                accuracy += 1

        accuracy = accuracy / float(len(testing_data))
        print("Accuracy using training data", percent/float(10)*100, "% is: ", accuracy)
Example #20
    std_dev = 0
    for a in accuracies:
        std_dev += ((a - average_accuracy)**2)
    std_dev = math.sqrt(1 / len(accuracies) * std_dev)

    print("Accuracy: {}, Std dev: {}".format(average_accuracy, std_dev))
    return (average_accuracy, std_dev)
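The loop above computes the population standard deviation (it divides by len(accuracies), not len - 1); the standard library produces the same value:

import statistics

std_dev = statistics.pstdev(accuracies)  # population std dev, divides by n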


"""
Main program. Uses decisionTree.py as a support program to classify data
"""

values = [list(a) for a in skset.data]
targets = [int(a) for a in skset.target]
dataset = [Data(values[i], targets[i]) for i in range(len(values))]
# print([str(a) for a in dataset])
# dataset = d.processFile('data/digitsModified.txt')

if algorithm == Algorithm.DECISION_TREE:
    d = DecisionTree(max_depth=max_depth)
    # tree = d.fit(dataset)
    check_accuracy(dt=d, dataset=dataset, num_repeats=10)

elif algorithm == Algorithm.RANDOM_FOREST:
    forest = Forest(max_depth=max_depth,
                    num_trees=num_trees,
                    num_samples=num_samples)
    # f = forest.fit(dataset)
    check_accuracy(dt=forest, dataset=dataset, num_repeats=10)