class DecisionTree(object):
    """ID3-style decision tree classifier built from a DataSource training set."""

    def __init__(self, datafile, splitmetric):
        """
        Constructor:
        @param datafile: training set, path to training data file
        @param splitmetric: fitness metric/function to choose best splitter
        """
        self.datasource = DataSource(datafile)
        self.splitmetric = splitmetric
        self.decisiontree = None

    def createDecisionTree(self):
        """Create a decision tree against the training set; returns self for chaining."""
        self.decisiontree = self.__treeGrowth__(self.datasource.dataset,
                                                self.datasource.attributes,
                                                self.datasource.targetAttr)
        return self

    def makeDecision(self, sample=None, testset=None, testfile=None):
        """
        Make decisions against a test set/sample.

        Exactly one input is used, in priority order: testfile (path to a
        test-data file), then testset (list of records), then sample (a single
        record).  Each record gets a 'decision?' key holding the predicted
        class label, and is printed.

        @param sample: single test record (dict of attribute -> value)
        @param testset: list of test records
        @param testfile: path to a test data file
        @raise ValueError: when none of the three inputs is supplied
        """
        testsamples = []
        if testfile:
            ds = DataSource(testfile)
            testsamples.extend(ds.dataset)
        elif testset:
            testsamples.extend(testset)
        elif sample:
            testsamples.append(sample)
        else:
            raise ValueError('No test set passed in.')
        for test in testsamples:
            # Walk from the root, following the branch matching the record's
            # value for the node's split attribute, until a leaf is reached.
            dtree = self.decisiontree
            while True:
                attr = dtree.data.attribute
                node = dtree.getNode(test[attr], False)
                if node.isBranch():
                    # Leaf reached: its attribute field holds the class label.
                    test['decision?'] = node.data.attribute
                    break
                else:
                    dtree = node
            print(test)

    def prettyTree(self):
        """Pretty-print the decision tree."""
        try:
            self.decisiontree.prettyTree()
        except AttributeError:
            # self.decisiontree is still None until createDecisionTree runs.
            # (Fixed grammar of the original error message.)
            raise AttributeError("You haven't created the decision tree yet, "
                                 "please make sure you have called "
                                 "createDecisionTree.")

    def __treeGrowth__(self, dataset, attributes, target):
        """
        Grow the decision tree recursively (ID3).

        @param dataset: training set (list of records)
        @param attributes: attribute set, which may contain the target attribute
        @param target: target attribute name
        @return: a Tree whose root is a DecisionNode
        """
        # Target values of every record, and the majority class as fallback.
        tvals = [record[target] for record in dataset]
        default = self.datasource.majorityValue(dataset)
        # Empty data, or only the target attribute left: majority-class leaf.
        if not dataset or (len(attributes) - 1) <= 0:
            return Tree(DecisionNode(default))
        # All records share one classification: return it as a leaf.
        elif tvals.count(tvals[0]) == len(tvals):
            return Tree(DecisionNode(tvals[0]))
        else:
            # Choose the attribute that best classifies the data.
            best = self.splitmetric(dataset, attributes, target)
            dtree = Tree(DecisionNode(best))
            # Attributes for the next level: everything except the fitted one.
            attrs = [attr for attr in attributes if attr != best]
            # One subtree per distinct value of the chosen attribute.
            for val in self.datasource.uniqueValues(dataset, best):
                subtree = self.__treeGrowth__(
                    self.datasource.subDataSet(dataset, best, val),
                    attrs, target)
                # Edge condition leading from `best` down to this subtree.
                subtree.data.condition = val
                dtree.addChild(subtree)
            return dtree
class NaiveBayes(object):
    """Naive Bayes classifier trained from a DataSource file."""

    def __init__(self, datafile):
        """
        Constructor:
        @param datafile: training set, path to training data file
        """
        self.datasource = DataSource(datafile)

    def classify(self, contAttrs=None, **kwargs):
        """
        Classify test sample/set using the NB classifier.

        Each classified record gets a '?' key holding the winning class label.

        @param contAttrs: names of the continual attributes in the test data
        @param kwargs: attribute-value pairs of a single sample, or testset=...;
                       once testset=... is given, only the test set is used
        """
        # None-sentinel instead of a mutable default argument ([]), which
        # would be shared across calls.
        contAttrs = [] if contAttrs is None else contAttrs
        clazzes = self.datasource.getClasses()
        testset = []
        if "testset" in kwargs:
            testset = kwargs["testset"]
        else:
            testset.append(kwargs)
        for record in testset:
            clazzProb = []
            for clss in clazzes:
                # Prior P(class), estimated from the full training set.
                prob = self.probD(self.datasource.dataset,
                                  {self.datasource.targetAttr: clss})
                for attr, value in record.items():
                    if attr == self.datasource.targetAttr:
                        continue
                    # Multiply in the likelihood P(attr=value | class).
                    prob *= self.conditionalEstimation(
                        clss, {attr: value},
                        continual=(attr in contAttrs))
                clazzProb.append([clss, prob])
            # Arg-max in one pass (the original recomputed max() once per
            # candidate, O(n^2)); max() keeps the first maximal entry, which
            # matches the original's tie-breaking.
            classlabel = max(clazzProb, key=lambda cp: cp[1])[0]
            record['?'] = classlabel

    def probD(self, dataset, attrvalDict=None, **kwargs):
        """
        Probability that a record satisfies the given DISCRETE
        attribute-value condition(s), with add-one smoothing.

        @param dataset: list of records (dicts)
        @param attrvalDict: attribute-value conditions; falls back to **kwargs
        @return: smoothed match fraction (matches + 1) / (total + 3)
        """
        if not attrvalDict:
            attrvalDict = kwargs
        totalrecords = len(dataset)
        matchrecords = 0
        for record in dataset:
            # Count how many conditions this record satisfies.  This counter
            # is reset per record; the original accumulated it across the
            # whole loop, so every matching record after the first one was
            # silently not counted.
            attrmatch = sum(1 for k, v in attrvalDict.items()
                            if record[str(k)] == v)
            if attrmatch == len(attrvalDict):
                matchrecords += 1
        # Smoothed estimate (3*1.0/3 == 1): (matches + 1) / (total + 3).
        return (matchrecords * 1.0 + 3 * 1.0 / 3) / (totalrecords + 3)

    def probC(self, dataset, attrvalDict=None, **kwargs):
        """
        Gaussian density estimate for a CONTINUAL attribute.

        For sure only one attribute-value pair is used; if several are given,
        only the first pair (but not guaranteed) will be used.  The input dict
        is read, not mutated (the original popitem() emptied the caller's
        dict).

        @param dataset: list of records whose values parse as int
        @return: normal pdf of the value under the dataset's mean/std
        """
        if not attrvalDict:
            attrvalDict = kwargs
        attrname, value = next(iter(attrvalDict.items()))
        v_seq = [int(v[attrname]) for v in dataset]
        n = len(v_seq)
        seq_mean = sum(v_seq) * 1.0 / n
        # Population standard deviation of the attribute over the dataset.
        seq_std = math.sqrt(sum((x - seq_mean) ** 2 for x in v_seq) / n)
        # Normal pdf evaluated at the (integer-coerced) test value.
        estimate = ((1.0 / (math.sqrt(2 * math.pi) * seq_std)) *
                    math.exp(-1 * (int(value) - seq_mean) ** 2 /
                             (2.0 * seq_std ** 2)))
        return estimate

    def conditionalEstimation(self, condition, attrvalDict=None,
                              continual=False, **kwargs):
        """
        Estimate the probability of attribute(s) conditioned on a class.

        @param condition: class label restricting the training subset
        @param attrvalDict: attribute-value pair(s) being estimated
        @param continual: True for a continual attribute (default False)
        @param kwargs: key-value pair of attribute(s) being estimated
        @return: conditional probability for the attribute(s)
        """
        # Restrict the training set to records of the given class, then
        # estimate within that subset.
        dataset = self.datasource.subDataSet(self.datasource.dataset,
                                             self.datasource.targetAttr,
                                             condition)
        if continual:
            return self.probC(dataset, attrvalDict, **kwargs)
        else:
            return self.probD(dataset, attrvalDict, **kwargs)

    def evidence(self, attrvalDict=None, **kwargs):
        """
        Calculate the evidence of the attribute(s) condition.

        Note: if kwargs contains a continual attribute, evidence is not an
        appropriate measurement when the training sample is small, but for a
        large training set calculating evidence for a continual attribute is
        reasonable.
        """
        return self.probD(self.datasource.dataset, attrvalDict, **kwargs)