Example #1
0
def normalizer(D, A):
   """Return the split information of partitioning dataset D on attribute A.

   Computed as -sum_v p_v * log2(p_v), where p_v is the fraction of D whose
   value for attribute A is v.  This is the denominator used to turn
   information gain into gain ratio.

   D -- dataset object exposing get_attributeValues() and get_dataSize().
   A -- the attribute to partition on.
   """
   split_info = 0.0
   for v in D.get_attributeValues(A):
      D_j = Dataset(None, None, None, D, A, v)
      pr_A = D_j.get_dataSize() / D.get_dataSize()
      # Guard empty partitions: log2(0) raises ValueError, and a zero-weight
      # partition contributes nothing to the split information anyway.
      # (Mirrors the pr_c() guard in entropy().)
      if pr_A > 0:
         split_info += pr_A * math.log(pr_A, 2)
   return -split_info
Example #2
0
def entropy(D, A=None):
   """Return the entropy of dataset D, optionally conditioned on attribute A.

   With A omitted: the class entropy -sum_c p(c) * log2(p(c)).
   With A given: the expected entropy after splitting D on A, i.e. the
   weighted sum over each value-partition D_j of |D_j|/|D| * entropy(D_j).

   D -- dataset object exposing get_classes(), pr_c(), get_attributeValues(),
        and get_dataSize().
   A -- optional attribute to condition on.
   """
   total = 0.0
   if A:
      for v_j in D.get_attributeValues(A):
         D_j = Dataset(None, None, None, D, A, v_j)
         total += D_j.get_dataSize() / D.get_dataSize() * entropy(D_j)
      return total
   for c_j in D.get_classes():
      # Cache pr_c once per class (the original called it three times);
      # skip zero-probability classes, for which log2 is undefined.
      p = D.pr_c(c_j)
      if p:
         total += p * math.log(p, 2)
   return -total
Example #3
0
   def decision_tree_rec(self, D, A, T, threshold):
      """Recursively grow a C4.5-style decision tree below XML element T.

      D -- current dataset partition.
      A -- collection of attributes still available for splitting.
      T -- parent XML element; leaves become 'decision' children, internal
           splits become 'node' children with one 'edge' per attribute value.
      threshold -- gain threshold passed through to the splitting selector.
      """
      assert D.get_numClasses() > 0
      if D.get_numClasses() == 1:
         # Pure partition: emit a leaf labeled with the single class.
         classification = D.get_classes().pop()
         num, choice = D.get_num_choice_tuple(D.get_classAttribute(), classification)

         decision = ElementTree.SubElement(T, 'decision')
         decision.set('end', '1')
         decision.set('num', num)
         decision.set('choice', choice)
      elif len(A) == 0:
         # Attributes exhausted: emit a leaf labeled with the plurality class.
         classification = D.get_mostPluralClass()
         num, choice = D.get_num_choice_tuple(D.get_classAttribute(), classification)

         decision = ElementTree.SubElement(T, 'decision')
         decision.set('end', '1')
         decision.set('num', num)
         decision.set('choice', choice)
      else:
         # Mixed classes: pick a splitting attribute (strategy chosen in
         # __init__: plain information gain or gain ratio).
         A_split = self.select_splitting_attribute(D, A, threshold)
         if A_split is None:
            # No attribute clears the threshold: stop with a plurality leaf.
            decision = ElementTree.SubElement(T, 'decision')
            decision.set('end', '1')
            decision.set('choice', D.get_mostPluralClass())
         else:
            node = ElementTree.SubElement(T, 'node')
            node.set('var', A_split)
            # BUG FIX: the original tested `a != A` (element vs. whole set,
            # always true), so A_split was never removed and recursion could
            # split on the same attribute indefinitely.
            AminusA_split = {a for a in A if a != A_split}
            for v in D.get_attributeValues(A_split):
               D_v = Dataset(None, None, None, D, A_split, v)
               # Empty value-partitions produce no edge.
               if D_v.get_dataSize() > 0:
                  edge = ElementTree.SubElement(node, 'edge')
                  num, var = D_v.get_num_choice_tuple(A_split, v)
                  edge.set('var', var)
                  edge.set('num', num)
                  self.decision_tree_rec(D_v, AminusA_split, edge, threshold)
Example #4
0
 def __init__(self,
       domain_filename, csv_filename, restrictions_filename = None,
       ratio=False
    ):
    """Load the training set and pick the attribute-selection strategy.

    domain_filename -- XML domain description file.
    csv_filename -- training data; its basename (sans extension) names the tree.
    restrictions_filename -- optional attribute-restrictions file; its mere
       presence marks the builder as restricted.
    ratio -- when True, split on gain ratio instead of plain information gain.
    """
    self.tree = None
    # The original assigned csv_filename[:-4] and then immediately overwrote
    # it; splitext alone handles extensions of any length.
    self.tree_name = os.path.splitext(csv_filename)[0]
    self.restricted = bool(restrictions_filename)
    if ratio:
       self.select_splitting_attribute = select_splitting_attribute_ratio
    else:
       self.select_splitting_attribute = select_splitting_attribute_default
    self.trainingSet = Dataset(domain_filename, csv_filename, restrictions_filename)
Example #5
0
class DecisionTreeBuilder(object):
   """Builds a C4.5-style decision tree from a training set and renders it as XML."""

   def __init__(self,
         domain_filename, csv_filename, restrictions_filename = None,
         ratio=False
      ):
      """Load the training set and pick the attribute-selection strategy.

      domain_filename -- XML domain description file.
      csv_filename -- training data; its basename (sans extension) names the tree.
      restrictions_filename -- optional attribute-restrictions file; its mere
         presence marks the builder as restricted.
      ratio -- when True, split on gain ratio instead of plain information gain.
      """
      self.tree = None
      # The original assigned csv_filename[:-4] and then immediately overwrote
      # it; splitext alone handles extensions of any length.
      self.tree_name = os.path.splitext(csv_filename)[0]
      self.restricted = bool(restrictions_filename)
      if ratio:
         self.select_splitting_attribute = select_splitting_attribute_ratio
      else:
         self.select_splitting_attribute = select_splitting_attribute_default
      self.trainingSet = Dataset(domain_filename, csv_filename, restrictions_filename)

   def decision_tree_rec(self, D, A, T, threshold):
      """Recursively grow the decision tree below XML element T.

      D -- current dataset partition.
      A -- collection of attributes still available for splitting.
      T -- parent XML element; leaves become 'decision' children, internal
           splits become 'node' children with one 'edge' per attribute value.
      threshold -- gain threshold passed through to the splitting selector.
      """
      assert D.get_numClasses() > 0
      if D.get_numClasses() == 1:
         # Pure partition: emit a leaf labeled with the single class.
         classification = D.get_classes().pop()
         num, choice = D.get_num_choice_tuple(D.get_classAttribute(), classification)

         decision = ElementTree.SubElement(T, 'decision')
         decision.set('end', '1')
         decision.set('num', num)
         decision.set('choice', choice)
      elif len(A) == 0:
         # Attributes exhausted: emit a leaf labeled with the plurality class.
         classification = D.get_mostPluralClass()
         num, choice = D.get_num_choice_tuple(D.get_classAttribute(), classification)

         decision = ElementTree.SubElement(T, 'decision')
         decision.set('end', '1')
         decision.set('num', num)
         decision.set('choice', choice)
      else:
         # Mixed classes: pick a splitting attribute via the strategy
         # selected in __init__ (information gain or gain ratio).
         A_split = self.select_splitting_attribute(D, A, threshold)
         if A_split is None:
            # No attribute clears the threshold: stop with a plurality leaf.
            decision = ElementTree.SubElement(T, 'decision')
            decision.set('end', '1')
            decision.set('choice', D.get_mostPluralClass())
         else:
            node = ElementTree.SubElement(T, 'node')
            node.set('var', A_split)
            # BUG FIX: the original tested `a != A` (element vs. whole set,
            # always true), so A_split was never removed and recursion could
            # split on the same attribute indefinitely.
            AminusA_split = {a for a in A if a != A_split}
            for v in D.get_attributeValues(A_split):
               D_v = Dataset(None, None, None, D, A_split, v)
               # Empty value-partitions produce no edge.
               if D_v.get_dataSize() > 0:
                  edge = ElementTree.SubElement(node, 'edge')
                  num, var = D_v.get_num_choice_tuple(A_split, v)
                  edge.set('var', var)
                  edge.set('num', num)
                  self.decision_tree_rec(D_v, AminusA_split, edge, threshold)

   def build_tree(self, threshold):
      """Build the tree from the training set; return the root XML element."""
      self.tree = Element('Tree')
      self.tree.set('name', self.tree_name)
      allAttributes = self.trainingSet.get_attributes()
      self.decision_tree_rec(self.trainingSet, allAttributes, self.tree, threshold)
      return self.tree

   def get_tree(self):
      """Return the most recently built tree (None before build_tree)."""
      return self.tree

   def get_xml(self, indent='   '):
      """Return the built tree serialized as pretty-printed XML."""
      return minidom.parseString(
            ElementTree.tostring(self.tree)).toprettyxml(indent=indent)

   def print_tree(self, file=sys.stdout, indent='   '):
      """Write the pretty-printed XML to *file* and return the XML string."""
      xml_str = self.get_xml(indent)
      print(xml_str, file=file)
      return xml_str

   def save_tree(self, file=None, indent='   '):
      """Save the XML to *file*, or to '<tree_name>[_restricted].xml'.

      Returns the XML string that was written.
      """
      if file:
         return self.print_tree(file, indent)
      xml_filename = self.tree_name
      if self.restricted:
         xml_filename += '_restricted'
      xml_filename += '.xml'
      # BUG FIX: the original opened self.tree_name + '.xml', discarding the
      # '_restricted' suffix it had just computed.
      with open(xml_filename, 'w') as save_file:
         return self.print_tree(save_file, indent)