from math import log

# Assumed import path: adjust to wherever the project's DecisionTree class
# (with its nested DecisionElement) actually lives.
from decision_tree import DecisionTree


class DecisionTreeBuilder(object):
    """Holds the dataset used to train a decision tree, together with the
    decision tree that is learnt from it."""

    __slots__ = '_dataMatrix', 'decisionTree', '_classLabelIndex', '_rowIndicesSet', '_colIndicesSet'
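    # _dataMatrix      : training rows (class label in the last column by default)
    # decisionTree     : the DecisionTree learnt by build()
    # _classLabelIndex : column index of the class label
    # _rowIndicesSet   : indices of all training rows
    # _colIndicesSet   : indices of the feature columns (class label excluded)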

    def __init__(self, dataMatrix, classLabelIndex=None):
        if not dataMatrix:
            raise ValueError('The data should not be empty')
        self._dataMatrix = dataMatrix
        self.decisionTree = DecisionTree()

        # 'is None' so that column 0 can still be chosen as the label column.
        if classLabelIndex is None:
            self._classLabelIndex = len(dataMatrix[0]) - 1
        else:
            self._classLabelIndex = classLabelIndex

        self._rowIndicesSet = set(range(len(dataMatrix)))
        self._colIndicesSet = set(range(len(
            dataMatrix[0]))) - {self._classLabelIndex}

    def build(self,
              maxDepth=1,
              rowIndicesSet=None,
              colIndicesSet=None,
              root=None):
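        # Recursively grow the tree: choose the attribute with the lowest
        # weighted entropy, add a decision node for it, then recurse into each
        # attribute value with the matching row subset and remaining columns.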
        if rowIndicesSet is None:
            rowIndicesSet = self._rowIndicesSet
        if colIndicesSet is None:
            colIndicesSet = self._colIndicesSet
        if maxDepth < 1:
            raise ValueError('Depth should be at least 1')

        bestAttrResult = self.getBestAttribute(rowIndicesSet, colIndicesSet)

        conditionAttrIndex, countMap = bestAttrResult

        # Sort the attribute values so children are created in a stable order;
        # each value maps to the index of the child branch it leads to.
        branchSplitMap = {}
        for count, attrValue in enumerate(
                sorted(countMap[conditionAttrIndex])):
            branchSplitMap[attrValue] = count

        classLabel = self.getClassLabel(countMap[conditionAttrIndex])
        decisionElement = DecisionTree.DecisionElement(conditionAttrIndex,
                                                       branchSplitMap,
                                                       classLabel)

        if not root:
            new_root = self.decisionTree.add_root(decisionElement)
        else:
            new_root = self.decisionTree.add_child(root, decisionElement)

        # Check whether every remaining row carries the same class label.
        hasZeroEntropy = True
        label = None
        for index in rowIndicesSet:
            if label is None:
                label = self._dataMatrix[index][self._classLabelIndex]
            elif label != self._dataMatrix[index][self._classLabelIndex]:
                hasZeroEntropy = False
                break

        # Stop growing this branch if:
        # 1. the split has zero entropy, or
        # 2. the maximum depth has been reached
        if hasZeroEntropy or self.decisionTree.height() >= maxDepth:
            return new_root

        # Inserting children in proper order
        for attrValue in sorted(countMap[conditionAttrIndex]):

            nextColIndicesSet = colIndicesSet - {conditionAttrIndex}
            nextRowIndicesSet = countMap[conditionAttrIndex][attrValue]['#']

            # 3. Recurse only while both features and rows remain.
            if nextColIndicesSet and nextRowIndicesSet:
                self.build(maxDepth, nextRowIndicesSet, nextColIndicesSet,
                           new_root)

        return new_root

    def getBestAttribute(self, rowIndicesSet, colIndicesSet):
        if (not isinstance(rowIndicesSet, set)
                or not isinstance(colIndicesSet, set)):
            raise TypeError(
                'This function expects sets of row and column indices')
        if not rowIndicesSet or not colIndicesSet:
            raise ValueError('The sets should contain at least one index')

        countMap = self.prepareCountMap(rowIndicesSet, colIndicesSet)

        # Pick the attribute whose split gives the lowest weighted entropy.
        # Start from +inf so multi-class entropies above 1 bit are handled.
        bestEntropy = float('inf')
        bestAttr = None
        for attribute in countMap:
            new_entropy = self.entropy(countMap[attribute])
            if new_entropy <= bestEntropy:
                bestEntropy = new_entropy
                bestAttr = attribute
        return (bestAttr, countMap)

    def prepareCountMap(self, rowIndicesSet, colIndicesSet):
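        # Shape of the returned map (illustrative):
        #   {attrIndex: {attrValue: {classLabel: count, ...,
        #                            '#': set of row indices with that value}}}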
        countMap = {}
        for i in colIndicesSet:
            countMap[i] = {}
        for rowNum in rowIndicesSet:
            for attrIndex in colIndicesSet:

                attrValue = self._dataMatrix[rowNum][attrIndex]
                classLabelValue = self._dataMatrix[rowNum][
                    self._classLabelIndex]

                if attrValue in countMap[attrIndex]:
                    if classLabelValue in countMap[attrIndex][attrValue]:
                        countMap[attrIndex][attrValue][classLabelValue] += 1.0
                    else:
                        countMap[attrIndex][attrValue][classLabelValue] = 1.0
                    countMap[attrIndex][attrValue]['#'].add(rowNum)
                else:
                    # '#' records which rows carry this attribute value.
                    countMap[attrIndex][attrValue] = {classLabelValue: 1.0,
                                                      '#': {rowNum}}
        return countMap

    def entropy(self, attrMap):
        entropy = 0.0
        totalCount = 0.0
        # Total number of rows covered by this attribute, across all values.
        for attrValue in attrMap:
            for classLabel in attrMap[attrValue]:
                if classLabel != '#':
                    totalCount += attrMap[attrValue][classLabel]

        # Weighted sum of the per-value entropies.
        for attrValue in attrMap:
            entropy += self.entropy_contribution(attrMap[attrValue],
                                                 totalCount)
        return entropy

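    # entropy_contribution computes one weighted term of the conditional
    # entropy H(S|A) = sum_v (|S_v|/|S|) * sum_c -p(c|v) * log2 p(c|v).
    # Worked example (illustrative numbers only): a value seen in 4 of 10
    # rows and split 3/1 between two labels contributes
    #   (4/10) * (-(3/4)*log2(3/4) - (1/4)*log2(1/4)) ~= 0.4 * 0.811 = 0.324.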
    def entropy_contribution(self, attrValueMap, countPerAttr):
        valueEntropy = 0.0
        countPerAttrValue = 0.0
        # Number of rows that carry this particular attribute value.
        for classLabel in attrValueMap:
            if classLabel != '#':
                countPerAttrValue += attrValueMap[classLabel]
        # Entropy of this value, weighted by how often the value occurs.
        for classLabel in attrValueMap:
            if classLabel != '#':
                p = attrValueMap[classLabel] / countPerAttrValue
                valueEntropy += -(p * log(p, 2)) * \
                                (countPerAttrValue / countPerAttr)
        return valueEntropy

    def getClassLabel(self, attrValueMap):
        # Majority class label over every row that reaches this node,
        # weighted by the per-class counts gathered in prepareCountMap.
        classDistribution = {}
        for attrValue in attrValueMap:
            for classLabel in attrValueMap[attrValue]:
                if classLabel != '#':
                    classDistribution[classLabel] = \
                        classDistribution.get(classLabel, 0) + \
                        attrValueMap[attrValue][classLabel]

        bestClassLabel = None
        count = 0
        for label in classDistribution:
            if classDistribution[label] > count:
                count = classDistribution[label]
                bestClassLabel = label

        return bestClassLabel

    # If a value is present in the test set but absent from the training set,
    # fall back to the first branch (index 0) of the attribute.
    def missingValue(self, nodeElement):
        arrayBranchMap = nodeElement.element()._branchSplitMap.items()
        for key, value in arrayBranchMap:
            if value == 0:
                return key

    def predict(self, featureArray):
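        # Walk down from the root, at each internal node following the branch
        # that matches the sample's value for that node's split attribute,
        # until a leaf is reached; return the leaf's class label.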
        root = self.decisionTree.root()

        while not self.decisionTree.is_leaf(root):
            conditionAttrIndex = root.element()._conditionAttrIndex
            # Feature values can turn out to be numbers, while the branch keys
            # come straight from the training data, hence the str() coercion.
            branchSplitValue = str(featureArray[conditionAttrIndex])
            if branchSplitValue not in root.element()._branchSplitMap:
                # Unseen attribute value: fall back to the first branch.
                childIndex = root.element()._branchSplitMap[
                    self.missingValue(root)]
            else:
                childIndex = root.element()._branchSplitMap[branchSplitValue]
            root = self.decisionTree.iThChild(root, childIndex)
        return root.element()._classLabel

    def print_Tree(self):
        root = self.decisionTree.root()
        print(root)

        if self.decisionTree.is_leaf(root):
            return
        # Print every branch value of the root together with the class label
        # of the child node that the branch leads to.
        for attrValue in sorted(root.element()._branchSplitMap,
                                key=root.element()._branchSplitMap.get):
            childIndex = root.element()._branchSplitMap[attrValue]
            child = self.decisionTree.iThChild(root, childIndex)
            print('%s -> %s' % (attrValue, child.element()._classLabel))
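

# ---------------------------------------------------------------------------
# Minimal usage sketch.  The toy dataset below is NOT part of the project; it
# only illustrates the build()/predict() round trip.  Attribute values are
# kept as strings because predict() converts incoming feature values with
# str() before looking them up in a node's branch map.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    toyData = [
        ['sunny', 'hot', 'no'],
        ['sunny', 'mild', 'no'],
        ['rainy', 'hot', 'yes'],
        ['rainy', 'mild', 'yes'],
    ]
    builder = DecisionTreeBuilder(toyData)  # last column is the class label
    builder.build(maxDepth=2)
    print(builder.predict(['rainy', 'hot']))  # expected output: yes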