class Benchmark: def __init__(self, classificationType): self.classificationType = classificationType pass class FeaturePrinter(Classifier): def fit(self, X, y): self.preprocessing(X, y) print(self.types) pass def measureOverfitting(self, classifierName): if classifierName == "RandomForests": self.iterations = 200 self.classifier = RandomForests(self.iterations) self.classifier.fit(self.train_X, self.train_y) print("done fitting") trainingScores = self.classifier.checkFitting( self.train_X, self.train_y) testScores = self.classifier.checkFitting(self.test_X, self.test_y) output = open("big_debug_rf.txt", "w") for index, scoreList in enumerate(trainingScores): for ptr, score in enumerate(scoreList): output.write( str(index + 1) + ":" + str(ptr + 1) + ":" + str(score) + "," + str(testScores[index][ptr]) + "\n") output.close() def computeConfusionMatrix(self): confusionMatrix = np.array([[0., 0.], [0., 0.]]) for index, sample in enumerate(self.test_X): predicted = self.classifier.predict(sample) confusionMatrix[predicted][int( self.test_y[index] == self.classifier.shrinked[1])] += 1 return confusionMatrix def measureROC(self, classifierName): if classifierName == "RandomForests": size = len(self.train_X[0]) self.classifier = RandomForests() self.classifier.fit(self.train_X, self.train_y) xs = [] ys = [] output = open("debug_roc.txt", "w") for (randomSize, confusionMatrix) in self.classifier.matrices: sensitivity = confusionMatrix[0][0] / (confusionMatrix[0][0] + confusionMatrix[1][0]) specificity = confusionMatrix[1][1] / (confusionMatrix[1][1] + confusionMatrix[0][1]) xs.append(1 - specificity) ys.append(sensitivity) print( str(randomSize) + " -> " + str(1 - specificity) + "," + str(sensitivity)) output.write( str(randomSize) + ":" + str(1 - specificity) + "," + str(sensitivity) + "\n") print(xs) print(ys) output.close() pass def run(self, classifierName): if classifierName == "FeaturePrinter": self.classifier = self.FeaturePrinter() self.classifier.fit(self.train_X, self.train_y) return if classifierName == "AdaBoost": self.classifier = AdaBoost() elif classifierName == "GradientBoost": self.classifier = GradientBoost() elif classifierName == "DecisionTree": self.classifier = DecisionTree() elif classifierName == "RandomForests": self.classifier = RandomForests() elif classifierName == "LogisticRegression": self.classifier = LogisticRegression(max_iter=10000) elif classifierName == "GaussianNB": self.classifier = GaussianNB() fitStart = time.time() self.classifier.fit(self.train_X, self.train_y) fitStop = time.time() scoreStart = time.time() score = self.classifier.score(self.test_X, self.test_y) scoreStop = time.time() toPrint = classifierName + ": accuracy=" + "{0:.2f}".format( score * 100) + "%, fit_time=" + "{0:.3f}".format( fitStop - fitStart) + ", score_time=" + "{0:.3f}".format(scoreStop - scoreStart) print(toPrint) self.maxLen = max(self.maxLen, len(toPrint)) pass def runAll(self): if self.classificationType == "binary_classification.txt": with open("binary_classification.txt", "r") as file: dataReader = DataReader() for dataName in file: # Read the data dataName = dataName.strip() X, y = dataReader.read(dataName) # Split it into training data and testing data self.train_X, self.test_X, self.train_y, self.test_y = train_test_split( X, y) self.maxLen = 0 print("Data: " + str(dataName)) print("Sizes: (train=" + str(len(self.train_X)) + ", test=" + str(len(self.test_X)) + ")") # And run the classifiers self.run("FeaturePrinter") self.run("AdaBoost") self.run("DecisionTree") self.run("GradientBoost") self.run("RandomForests") self.run("LogisticRegression") self.run("GaussianNB") # And separate print("-" * self.maxLen) elif self.classificationType == "test_roc.txt": with open("test_roc.txt", "r") as file: dataReader = DataReader() for dataName in file: # Read the data dataName = dataName.strip() X, y = dataReader.read(dataName) # Split it into training data and testing data self.train_X, self.test_X, self.train_y, self.test_y = train_test_split( X, y) print("Data: " + str(dataName)) print("Sizes: (train=" + str(len(self.train_X)) + ", test=" + str(len(self.test_X)) + ")") self.measureROC("RandomForests") elif self.classificationType == "test_overfitting.txt": with open("test_overfitting.txt", "r") as file: dataReader = DataReader() for dataName in file: # Read the data dataName = dataName.strip() X, y = dataReader.read(dataName) # Split it into training data and testing data self.train_X, self.test_X, self.train_y, self.test_y = train_test_split( X, y) print("Data: " + str(dataName)) print("Sizes: (train=" + str(len(self.train_X)) + ", test=" + str(len(self.test_X)) + ")") self.measureOverfitting("RandomForests") elif self.classificationType == "test": dataReader = DataReader(False) X, y = dataReader.read("google_dataset.csv") self.train_X, self.test_X, self.train_y, self.test_y = train_test_split( X, y) for index, sample in enumerate(self.train_X): print( str(index) + " : " + str(sample) + " -> " + str(self.train_y[index])) for index, sample in enumerate(self.test_X): print(str(sample) + " -> " + str(self.test_y[index])) self.maxLen = 0 self.run("GradientBoost") pass