-
Notifications
You must be signed in to change notification settings - Fork 0
/
Main.py
116 lines (95 loc) · 3.79 KB
/
Main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# Lab about decision trees made by Kevin Bollengier
import fileinput
import math
from collections import Counter
from typing import List, Tuple

import arff

import Dataset
def main():
    """
    Entry point: read an ARFF dataset from input and run one ID3 step on it.
    """
    parsed = parse_arff()
    id3(get_data(parsed), get_attributes(parsed), -1)
def parse_arff()->dict:
    """
    Read an ARFF document from stdin / command-line file arguments and parse it.

    Collects every line supplied via ``fileinput`` and hands the joined text
    to the arff library.

    :return: The parsed dataset as a dictionary (contains at least the
        "attributes" and "data" keys used elsewhere in this module).
    """
    lines = []
    for line in fileinput.input():
        lines.append(line)
    # BUG FIX: the original annotated the intermediate with the `Dataset`
    # *module* as if it were a type, which is meaningless; arff.loads
    # already returns the plain dict this function promises.
    return arff.loads("\n".join(lines))
def get_attributes(dataset: dict)->list:
    """
    Extract the attribute definitions from a parsed ARFF dataset.

    :param dataset: The dataset parsed into a dict by the arff library.
    :return: The attributes as a list of (name, values) tuples.
    """
    return dataset["attributes"]
def get_data(dataset_lastcolumn: dict)->list:
    """
    Extract the data rows from a parsed ARFF dataset.

    :param dataset_lastcolumn: The dataset parsed into a dict by the arff
        library.
    :rtype: list
    :return: The data values as a multidimensional list (one list per row).
    """
    return dataset_lastcolumn["data"]
def calc_entropy(dataset: list)->float:
    """
    Calculate the Shannon entropy (base 2) of a list of values.

    :param dataset: List of hashable values, e.g. one column of the dataset.
    :return: The entropy in bits as a float (0 for an empty list, matching
        the previous behaviour where both loops simply did not run).
    """
    # Idiom: collections.Counter replaces the hand-rolled frequency dict,
    # and the list length is hoisted out of the loop.
    total = len(dataset)
    result = 0
    for count in Counter(dataset).values():
        p = count / total
        result -= p * math.log(p, 2)
    return result
def calc_inf_gain(dataset: list, attribute: list, entropy: float, index: int)->float:
    """
    Compute the information gain obtained by splitting on one attribute.

    :param dataset: The dataset as a list of rows (last column = class label).
    :param attribute: (name, possible values) pair for the attribute.
    :param entropy: Entropy of the dataset before the split.
    :param index: Column index of the attribute within each row.
    :return: The information gain as a float value.
    """
    gain = entropy
    total = len(dataset)
    for candidate in attribute[1]:
        # Collect the class labels of the rows matching this value.
        labels = []
        for row in dataset:
            if row[index] == candidate:
                labels.append(row[-1])
        gain -= len(labels) / total * calc_entropy(labels)
    return gain
def get_highest_ig(dataset: list, entropy: float, attributes: list)->Tuple[str, List[str]]:
    """
    Find the attribute that yields the highest information gain to split on.

    :param dataset: The dataset as a list of rows (last column = class label).
    :param entropy: The entropy of the dataset.
    :param attributes: List of (name, values) tuples; the class attribute is
        assumed to be the last one and is not a split candidate.
    :return: The (name, values) tuple of the best attribute.
    """
    info_gain_per_attr = {}
    # Exclude the last column: it is the class label, not a candidate.
    for attr in range(len(dataset[0]) - 1):
        info_gain_per_attr[attr] = calc_inf_gain(dataset, attributes[attr], entropy, attr)
    # BUG FIX: the original took max() over the dict *keys* (column indices),
    # which always picked the last candidate attribute regardless of its
    # information gain. Select the index whose computed gain is maximal.
    best_index = max(info_gain_per_attr, key=info_gain_per_attr.get)
    return attributes[best_index]
def split_attr(dataset: List[str], best_attribute: Tuple[str, List[str]], index: int)->List[List[str]]:
    """
    Partition the dataset by the values of the chosen attribute.

    :param dataset: The dataset as a list of rows.
    :param best_attribute: (name, values) tuple of the attribute to split on.
    :param index: Column index of that attribute within each row.
    :return: One sub-list of rows per attribute value, in value order
        (sub-lists may be empty when no row matches a value).
    """
    return [
        [row for row in dataset if row[index] == value]
        for value in best_attribute[1]
    ]
def id3(dataset, attributes, tree_depth):
    """
    Perform one step of the ID3 algorithm: compute the entropy of the class
    column, pick the best attribute, and split the dataset on it.

    :param dataset: The dataset as a list of rows.
    :param attributes: List of (name, values) tuples; the class attribute is
        assumed to be the last one.
    :param tree_depth: Depth of the parent node; incremented for this node.
    """
    depth = tree_depth + 1
    # The class label sits in the column matching the last attribute.
    class_column = [row[len(attributes) - 1] for row in dataset]
    entropy = calc_entropy(class_column)
    best_attribute = get_highest_ig(dataset, entropy, attributes)
    subsets = split_attr(dataset, best_attribute, attributes.index(best_attribute))
    # TODO: print the tree
# Run the lab only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()