def evaluate(self, Classifier, *args):
    """Train and evaluate ``Classifier`` from command-line style options.

    Scans ``args[0]`` for flags:
      -t <file>      load the training set
      -T <file>      load the test set
      -p <fraction>  duplicate the first fraction*N training examples
                     back into the training set (oversampling)
    Returns the ``str()`` of a Performance object.

    NOTE(review): indentation below is reconstructed from a collapsed
    source line — the nesting of the -p duplication loop under the -p
    flag is a best guess; confirm against the original file.
    """
    proportion = 0
    average = 0.0
    accuracies = []
    performance = Performance()
    trainingSet = DataSet()
    # Scan the option vector for -t / -T / -p flags.
    for num in range(0, len(args[0])):
        if args[0][num] == "-t":
            trainingSet.load(args[0][num + 1])
        if args[0][num] == "-T":
            testSet = DataSet()
            testSet.load(args[0][num + 1])
        if args[0][num] == "-p":
            proportion = float(args[0][num + 1])
            # Re-adds the first proportion*N examples to the training set
            # — presumably deliberate oversampling; TODO confirm.
            for items in range(
                    0, int(proportion * len(trainingSet.getExamples().getExamplesList()))):
                trainingSet.getExamples().add(
                    trainingSet.getExamples().getExamplesList()[items])
            # Self-assignment; appears to be a no-op refresh — TODO confirm.
            trainingSet.setAttributes(trainingSet.getAttributes())
    if type(Classifier) == ID3:
        Classifier.train(trainingSet)
        performance = Classifier.classify(testSet)
        return str(performance)
    else:
        # NOTE(review): prints an error yet still classifies and returns.
        print "Error in Evaluator:evaluate"
        performance = Classifier.classify(testSet)
        return str(performance)
    # NOTE(review): DEAD CODE — both branches above return, so this
    # cross-validation loop can never execute. Confirm intended flow.
    for num in range(0, self.folds):
        testSet = DataSet()
        trainSet = DataSet()
        for items in trainingSet.getExamples().getExamplesList():
            # Random fold assignment per example.
            randomNum = random.randint(0, self.folds - 1)
            if randomNum != num:
                testSet.getExamples().add(items)
            else:
                # NOTE(review): adds back into trainingSet — the very
                # collection being iterated — leaving trainSet empty;
                # probably meant trainSet.getExamples().add(items).
                trainingSet.getExamples().add(items)
        testSet.setAttributes(trainingSet.getAttributes())
        trainSet.setAttributes(trainingSet.getAttributes())
        if (len(trainingSet.attributes.attributes) > 0):
            # NOTE(review): rebinds trainSet to the FULL training set,
            # discarding the per-fold split built above — TODO confirm.
            trainSet = trainingSet
        Classifier.train(trainSet)
        tempPerformance = Classifier.classify(testSet)
        accuracies.append(tempPerformance.accuracy)
        average += tempPerformance.accuracy
        performance += tempPerformance
    # Aggregate accuracy +- standard deviation across folds.
    return str(performance) + " +- " + str(self.stdDev(
        accuracies, average))
def evaluate(self, Classifier, *args):
    """Train and evaluate ``Classifier`` from command-line style options.

    NOTE(review): this is a second, near-identical definition of
    ``evaluate`` — in Python the later definition silently replaces the
    earlier one. Confirm which copy is intended and remove the other.

    Flags scanned in ``args[0]``: -t training file, -T test file,
    -p fraction of training examples to duplicate. Returns ``str()`` of
    a Performance object. Indentation is reconstructed from a collapsed
    source line; the -p loop nesting is a best guess.
    """
    proportion = 0
    average = 0.0
    accuracies = []
    performance = Performance()
    trainingSet = DataSet()
    for num in range(0, len(args[0])):
        if args[0][num] == "-t":
            trainingSet.load(args[0][num+1])
        if args[0][num] == "-T":
            testSet = DataSet()
            testSet.load(args[0][num+1])
        if args[0][num] == "-p":
            proportion = float(args[0][num+1])
            # Oversampling: re-add the first proportion*N examples.
            for items in range(0, int(proportion * len(trainingSet.getExamples().getExamplesList()))):
                trainingSet.getExamples().add(trainingSet.getExamples().getExamplesList()[items])
            trainingSet.setAttributes(trainingSet.getAttributes())
    if type(Classifier) == ID3:
        Classifier.train(trainingSet)
        performance = Classifier.classify(testSet)
        return str(performance)
    else:
        # NOTE(review): error is printed but classification proceeds anyway.
        print "Error in Evaluator:evaluate"
        performance = Classifier.classify(testSet)
        return str(performance)
    # NOTE(review): DEAD CODE — both branches above return; the fold loop
    # below is unreachable.
    for num in range(0, self.folds):
        testSet = DataSet()
        trainSet = DataSet()
        for items in trainingSet.getExamples().getExamplesList():
            randomNum = random.randint(0,self.folds-1)
            if randomNum != num:
                testSet.getExamples().add(items)
            else:
                # NOTE(review): mutates trainingSet while iterating it;
                # probably meant trainSet — TODO confirm.
                trainingSet.getExamples().add(items)
        testSet.setAttributes(trainingSet.getAttributes())
        trainSet.setAttributes(trainingSet.getAttributes())
        if (len(trainingSet.attributes.attributes) > 0):
            # NOTE(review): discards the fold split in favour of the full set.
            trainSet = trainingSet
        Classifier.train(trainSet)
        tempPerformance = Classifier.classify(testSet)
        accuracies.append(tempPerformance.accuracy)
        average += tempPerformance.accuracy
        performance += tempPerformance
    return str(performance) + " +- " + str(self.stdDev(accuracies, average))
#f = dataPath + "IBk\\sample_set_life.gla" #f = dataPath + "IBk\\sample_set_word.gla" #f = dataPath + "DataSet_Client Document Preparation for Engine Tuning.gla" f = dataPath + "HospitalDocuments.gla" f = dataPath + "DataSets\\20160126_1501_ClientSiteData.gla" f = dataPath + "DataSets\\20160129_1322_ClientSiteData.gla" f = dataPath + "DataSets\\20160129_1358_ClientSiteData.gla" f = dataPath + "DataSets\\20160201_1530_ClientSiteData.gla" ds = DataSet(f) dt = DecisionTree() es = Estimator() pr = Prune() a = ds.getAttributes() b, c, d = ds.getTrainValidateTestSet(.7) #b, d, c = ds.getTrainValidateTestSet(.7) #b, d = ds.getTrainTestSet() #print len(b), len(c), len(d) dt.train(b,a, 4, 3) output = dt.test(d) print "Single DT on c: {0}%".format(round(es.accuracy(output)*100, 2)) print "train\t\t", len(b), b.getAllLabels() print "validate\t", len(c), c.getAllLabels() print "test\t\t", len(d), d.getAllLabels() print "Output\t\t\t{0}\n".format([o[0] for o in output]) print "Output {0}".format(set([o[0] for o in output]))
""" types = { "levenshtein": Distance().levenshtein, "l": Distance().levenshtein, 0: Distance().levenshtein , "hamming": Distance().hamming, "h": Distance().hamming, 1: Distance().hamming , "euclidean": Distance().euclidean, "e": Distance().euclidean, 2: Distance().euclidean , "manhattan": Distance().manhattan, "m": Distance().manhattan, 3: Distance().manhattan , "chebyshev": Distance().chebyshev, "c": Distance().chebyshev, 4: Distance().chebyshev } results = [types[distanceType](x.getValue(), data) for x in self.trainset] results = [(i,x) for i,x in enumerate(results)] kernels = sorted(results, key = lambda x:x[1])[:3] kernels = [self.trainset[i].getLabel() for i,x in kernels] kernels = [(n, kernels.count(n)) for n in set(kernels)] return sorted(kernels, key = lambda x:x[1], reverse = True)[0][0] if __name__ == "__main__": from DataSet import DataSet ds = DataSet("C:\\Users\\a5rjqzz\\Desktop\\Python\\pyClassifiers\\data\\IBk\\sample_set_lang.gla") bk = IBk() bk.train(ds.getExamples()) kn = ds.convert("y n n") cl = bk.classify(kn, 3) print cl print ds.getAttributes(1)[-1].getLabel(cl)
class knn(Classifier):
    """k-nearest-neighbour classifier over a stored DataSet.

    A lazy learner: train() only stores the examples; classify() scans
    them, keeps the k nearest by Hamming-style attribute mismatch count,
    and returns the majority label.

    NOTE(review): indentation below is reconstructed from collapsed
    source lines — nesting inside classify() is a best guess; confirm
    against the original file.
    """

    def __init__(self, *args):
        super(knn, self).__init__(*args)
        self.k = 3                  # number of neighbours consulted
        self.instances = DataSet()  # stored training examples
        self.setOptions(args)

    def train(self, inDataSet):
        # Lazy learning: "training" is just remembering the dataset.
        self.instances = inDataSet

    def classify(self, input):
        """Classify a single Example, or score a whole DataSet.

        Example input  -> returns the majority label of the k nearest
                          stored examples.
        DataSet input  -> returns a Performance with the fraction of
                          stored examples whose predicted label matches
                          their true label.
        NOTE(review): the DataSet branch never reads ``input`` — it
        evaluates on self.instances (a self-test). Confirm whether it
        should iterate the passed-in set instead.
        """
        neighbors = []
        if type(input) == Example:
            for index, item in enumerate(self.instances.getExamples().getExamplesList()):
                if len(neighbors) < self.k:
                    # Fill the neighbour pool with the first k examples;
                    # each neighbour stores (true label, distance to input).
                    tempNeighbor = neighbor()
                    tempNeighbor.setNeighbor(self.instances.getAttributes().getAttributesList()[self.instances.getAttributes().getClassIndex()].domain[item.values[self.instances.getAttributes().getClassIndex()]], self.distance(input, item))
                    neighbors.append(tempNeighbor)
                else:
                    # Find the farthest neighbour currently in the pool.
                    highestDist = -1
                    highestIndex = -1
                    for num in range(0, len(neighbors)):
                        if num < len(neighbors)-1:
                            # Pairwise comparison of adjacent neighbours.
                            if neighbors[num].distance >= neighbors[num + 1].distance:
                                highestDist = neighbors[num].distance
                                highestIndex = num
                            else:
                                highestDist = neighbors[num + 1].distance
                                highestIndex = num + 1
                        elif neighbors[num] < highestDist:
                            # NOTE(review): compares a neighbor OBJECT to a
                            # number (legal only under Python 2's arbitrary
                            # ordering); probably meant
                            # neighbors[num].distance — TODO confirm. The
                            # assignments below then store the object, not
                            # its distance.
                            highestDist = neighbors[num]
                            highestIndex = num
                    # Replace the farthest neighbour if this example is closer.
                    if self.distance(input, self.instances.getExamples().getExamplesList()[index]) < highestDist:
                        newNeighbor = neighbor()
                        newNeighbor.setNeighbor(self.instances.getAttributes().getAttributesList()[self.instances.getAttributes().getClassIndex()].domain[item.values[self.instances.getAttributes().getClassIndex()]], self.distance(input,item))
                        neighbors[highestIndex] = newNeighbor
            return self.vote(neighbors)
        elif type(input) == DataSet:
            rightCount = 0
            # Self-test: classify every stored example and count matches
            # against its recorded class label.
            for index, item in enumerate(self.instances.getExamples().getExamplesList()):
                if self.classify(self.instances.getExamples().getExamplesList()[index]) == self.instances.getAttributes().getClassAttribute().domain[self.instances.getExamples().getExamplesList()[index].values[self.instances.getExamples().attributes.getClassIndex()]]:
                    rightCount += 1
            performance = Performance()
            performance.setPerf(rightCount, len(self.instances.getExamples().getExamplesList()))
            return performance

    def setOptions(self, arguments):
        """Parse command-line style options from arguments[0].

        -k <int>   neighbour count
        -t <file>  load a training set into self.instances
        """
        for num in range(0, len(arguments[0])):
            if arguments[0][num] == "-k":
                self.k = int(arguments[0][num+1])
            elif arguments[0][num] == "-t":
                newDataSet = DataSet()
                newDataSet.load(arguments[0][num+1])
                self.instances = newDataSet

    def distance(self, observation, example):
        """Count mismatched attribute values (Hamming-style distance).

        The final attribute (the class) is excluded via the -1 bound.
        """
        total = 0
        for num in range(0, len(observation.attributes.getAttributesList())-1):
            if observation.values[num] != example.values[num]:
                total += 1
        return total

    def vote(self, neighbors):
        """Return the most common ``classifier`` label among neighbours.

        Ties are broken arbitrarily by max() over the dict.
        """
        voteDict = {}
        for index, items in enumerate(neighbors):
            if items.classifier in voteDict.keys():
                voteDict[items.classifier] += 1
            else:
                voteDict[items.classifier] = 1
        return max(voteDict, key = voteDict.get)