Esempio n. 1
0
def _read_tsv_rows(path):
    """Return the non-blank lines of *path*, each split on tabs.

    Lines are stripped of surrounding whitespace before splitting; lines that
    are empty after stripping are skipped so they do not become bogus
    one-column rows.
    """
    with open(path) as f:
        stripped_lines = (line.strip() for line in f)
        return [line.split("\t") for line in stripped_lines if line]


def _predict_label(tree, entry, attribute_name):
    """Walk *tree* for one test row and return the label that was reached.

    The tree is traversed as nested dicts of the form
    ``{attribute: {value: subtree_or_label}}``: the first key of the current
    dict names the attribute to test, and the row's value for that attribute
    selects the next subtree.  The walk stops when a non-dict node is reached.

    Args:
        tree: the dict returned by ``DecisionTree.GenerateTree``.
        entry: one test row (list of column values).
        attribute_name: column names, used to map an attribute name to its
            position in *entry*.
    """
    node = tree.copy()
    parent_branches = node
    # Branches under the root attribute; handed to getMaxLabel as a fallback.
    root_branches = tree.copy()
    root_branches = root_branches[next(iter(root_branches))]
    result = ""

    while isinstance(node, dict):
        attribute = next(iter(node))
        branches = node[attribute]
        root = TreeNode.TreeNode(attribute, branches)
        value = entry[attribute_name.index(root.value)]

        if value not in branches:
            # The row's value has no branch in the tree: defer to getMaxLabel
            # (defined elsewhere; per the original comment, a majority vote).
            return getMaxLabel(parent_branches, branches, root_branches)

        result = branches[value]
        parent_branches = branches
        node = branches[value]
    return result


def main():
    """Train a decision tree from a TSV file and label the rows of a test file.

    Command-line arguments (read from ``sys.argv``):
        1. training file -- tab-separated; the first line is the header and
           the last column is the decision class.
        2. test file -- tab-separated; the first line is skipped as a header
           and columns are looked up by position using the training header.
        3. output file -- receives the header line followed by every test row
           with its predicted label appended as the last column.

    Blank lines in either input file are ignored.
    """
    train_file = sys.argv[1]
    test_file = sys.argv[2]
    output_file = sys.argv[3]

    # First training row is the header; the rest are the samples.
    train_rows = _read_tsv_rows(train_file)
    attribute_name = train_rows[0]
    attribute_list = train_rows[1:]

    # The decision class is the last column of the training header.
    decision_label = attribute_name[-1]
    tree = DecisionTree.GenerateTree(attribute_list, attribute_name)

    # Drop the test file's header row.
    test_data = _read_tsv_rows(test_file)[1:]

    # From here on attribute_name holds only the non-decision columns, so the
    # indices found during the tree walk line up with the test rows.
    attribute_name.remove(decision_label)

    # `with` guarantees the output file is closed even if prediction raises.
    with open(output_file, "w") as out:
        out.write("\t".join(attribute_name + [decision_label]) + "\n")
        for entry in test_data:
            result = _predict_label(tree, entry, attribute_name)
            out.write("\t".join(entry + [result]) + "\n")