def normalizer(D, A):
    """Return the entropy of the partition of D induced by attribute A.

    For every value ``v`` reported by ``D.get_attributeValues(A)`` a
    sub-dataset is built with ``Dataset(None, None, None, D, A, v)`` and its
    share of the rows, ``pr = |D_v| / |D|``, is computed.  The result is
    ``-sum(pr * log2(pr))`` over those values.

    NOTE(review): presumably the "split information" denominator of a
    gain-ratio criterion -- confirm against the caller.

    Args:
        D: dataset exposing ``get_attributeValues`` and ``get_dataSize``.
        A: attribute identifier understood by ``D``.

    Returns:
        The partition entropy in bits, as a non-negative float.

    Raises:
        ZeroDivisionError: if ``D`` reports a size of 0 while still listing
            attribute values.
    """
    data_size = D.get_dataSize()  # loop-invariant, read once
    total = 0.0
    for v in D.get_attributeValues(A):
        D_j = Dataset(None, None, None, D, A, v)
        pr_A = D_j.get_dataSize() / data_size
        # A value with no matching rows contributes nothing (0 * log 0 is
        # taken as 0); without the guard math.log(0, 2) raises ValueError.
        if pr_A:
            # Subtracting (instead of negating the sum at the end) gives the
            # same magnitude but never produces -0.0.
            total -= pr_A * math.log(pr_A, 2)
    return total
def entropy(D, A=None):
    """Return the entropy of D, optionally conditioned on attribute A.

    Without ``A`` (or with any falsy ``A``) this is the class entropy
    ``-sum(p * log2(p))`` where ``p = D.pr_c(c)`` for each class ``c`` in
    ``D.get_classes()``; classes with ``p == 0`` are skipped.

    With ``A`` this is the weighted average of the class entropies of the
    sub-datasets ``Dataset(None, None, None, D, A, v)``, one per value ``v``
    in ``D.get_attributeValues(A)``, each weighted by ``|D_v| / |D|``.

    Args:
        D: dataset exposing ``get_classes``/``pr_c`` and, when ``A`` is
            given, ``get_attributeValues``/``get_dataSize``.
        A: attribute identifier to condition on, or ``None``.

    Returns:
        The entropy in bits, as a non-negative float.
    """
    if A:
        data_size = D.get_dataSize()  # loop-invariant, read once
        total = 0.0
        for v_j in D.get_attributeValues(A):
            D_j = Dataset(None, None, None, D, A, v_j)
            weight = D_j.get_dataSize() / data_size
            # An empty partition has zero weight, so its entropy is never
            # needed; skipping it avoids recursing into an empty dataset.
            if weight:
                total += weight * entropy(D_j)
        return total

    total = 0.0
    for c_j in D.get_classes():
        p = D.pr_c(c_j)  # queried once per class
        if p:
            # Subtracting term by term yields the same magnitude as negating
            # the final sum, but a pure dataset gives 0.0 rather than -0.0.
            total -= p * math.log(p, 2)
    return total
def decision_tree_rec(self, D, A, T, threshold):
    """Recursively grow a decision tree for D as XML elements under T.

    Three outcomes are possible for the current dataset:

    * ``D`` holds a single class -> a ``<decision>`` leaf for that class.
    * ``A`` is empty, or ``self.select_splitting_attribute`` returns
      ``None`` -> a ``<decision>`` leaf for ``D.get_mostPluralClass()``.
    * otherwise -> a ``<node var=A_split>`` with one ``<edge var=.. num=..>``
      per non-empty value partition, each grown recursively with the
      split attribute removed from the candidate set.

    Every leaf carries ``end='1'`` plus the ``num``/``choice`` pair returned
    by ``D.get_num_choice_tuple(D.get_classAttribute(), classification)``.

    Args:
        D: dataset to split; must report at least one class.
        A: collection of attribute identifiers still available.
        T: ``ElementTree`` element that receives the new child.
        threshold: passed through to ``self.select_splitting_attribute``.

    Raises:
        AssertionError: if ``D`` reports no classes.
    """

    def add_leaf(classification):
        # One place builds every leaf so all three leaf paths emit the same
        # attributes ('end', 'num', 'choice').
        num, choice = D.get_num_choice_tuple(D.get_classAttribute(), classification)
        decision = ElementTree.SubElement(T, 'decision')
        decision.set('end', '1')
        decision.set('num', num)
        decision.set('choice', choice)

    num_classes = D.get_numClasses()
    assert num_classes > 0, 'dataset has no classes'

    if num_classes == 1:
        print('make T a leaf node with labeled with c')
        # Read the only class without pop(): pop() would remove it from the
        # collection returned by D, which may be the dataset's own state.
        classification = next(iter(D.get_classes()))
        print(classification)
        add_leaf(classification)
        return

    if not A:
        print('make T a leaf node labeled with the most frequent class')
        add_leaf(D.get_mostPluralClass())
        return

    print('contains examples belonging to a mixture of classes')
    A_split = self.select_splitting_attribute(D, A, threshold)
    print('SPLITTING ON %s: ' % (A_split,))
    # NOTE: a gain-ratio variant (select_splitting_attribute_ratio) was
    # previously tried at this point and left disabled.

    if A_split is None:
        # No attribute was selected: stop here with the majority class.
        add_leaf(D.get_mostPluralClass())
        return

    node = ElementTree.SubElement(T, 'node')
    node.set('var', A_split)

    # Candidates for the subtrees: everything except the attribute just used.
    remaining = {a for a in A if a != A_split}

    for v in D.get_attributeValues(A_split):
        D_v = Dataset(None, None, None, D, A_split, v)
        if D_v.get_dataSize() > 0:
            edge = ElementTree.SubElement(node, 'edge')
            num, var = D_v.get_num_choice_tuple(A_split, v)
            edge.set('var', var)
            edge.set('num', num)
            self.decision_tree_rec(D_v, remaining, edge, threshold)
        else:
            print('IGNORING %s' % str(v))