def knn(self, predictData=None, trainData=None):
    """Evaluate the custom k-NN classifier with leave-one-out cross validation.

    Each fold of ``trainData`` takes a turn as the held-out set; metrics are
    collected per fold. NOTE(review): ``predictData`` is normalized up front
    but then rebound to each held-out fold, so the normalized argument is
    never actually classified — preserved as-is. Returns
    (accuracy, precision, recall, f_score) lists, one entry per fold.
    """
    helper = hp()
    classifier = knn()
    accuracy, precision, recall, f_score = [], [], [], []
    mean, stdDev = helper.normalizeData(trainData)
    nn = int(input("Enter the number of closest neighbors to consider: "))
    helper.normalizeEvaluationSet(predictData, mean, stdDev)
    for heldOut in range(len(trainData)):
        predictData = trainData[heldOut]
        remaining = [fold for idx, fold in enumerate(trainData) if idx != heldOut]
        trainingList = helper.convertToList(remaining)
        classifier.classify(trainingList, predictData, nn)
        tp, tn, fp, fn = helper.findParams(predictData)
        accuracy.append(helper.findAccuracy(tp, tn, fp, fn))
        foldPrecision = helper.findPrecision(tp, tn, fp, fn)
        foldRecall = helper.findRecall(tp, tn, fp, fn)
        precision.append(foldPrecision)
        recall.append(foldRecall)
        f_score.append(helper.findFMeasure(foldPrecision, foldRecall))
    return accuracy, precision, recall, f_score
def bayes_naive(self, predictData, trainData):
    """Evaluate the custom Naive Bayes classifier with leave-one-out cross validation.

    For each fold: priors and per-class descriptor statistics are fitted on
    the remaining folds, the held-out fold is classified in place, and the
    confusion-matrix metrics are appended. Returns
    (accuracy, precision, recall, f_score) lists, one entry per fold.
    """
    helper = hp()
    model = bayes()
    accuracy, precision, recall, f_score = [], [], [], []
    for heldOut in range(len(trainData)):
        predictData = trainData[heldOut]
        remaining = [fold for idx, fold in enumerate(trainData) if idx != heldOut]
        trainingList = helper.convertToList(remaining)
        classPriorProbabilities = model.findClassPriorProbability(trainingList)
        classes = model.segregateClasses(trainingList)
        occurences, means, stdDev = model.findDescriptorPosteriorProbabilites(
            classes, trainingList)
        model.classify(predictData, classPriorProbabilities, occurences, means,
                       stdDev)
        tp, tn, fp, fn = helper.findParams(predictData)
        accuracy.append(helper.findAccuracy(tp, tn, fp, fn))
        foldPrecision = helper.findPrecision(tp, tn, fp, fn)
        foldRecall = helper.findRecall(tp, tn, fp, fn)
        precision.append(foldPrecision)
        recall.append(foldRecall)
        f_score.append(helper.findFMeasure(foldPrecision, foldRecall))
    return accuracy, precision, recall, f_score
def knn2(self, predictData, trainData, labels):
    """Classify ``predictData`` with sklearn's KNeighborsClassifier (k=9, euclidean).

    Training points from every fold of ``trainData`` are pooled, standardized,
    and used to fit one model; predictions are keyed by record id starting at
    418 (the first test-record id — presumably fixed by the dataset; confirm
    against the caller). Returns a dict mapping record id -> predicted label.

    Fixes vs. original: removed an unused ``hp()`` instance and a ``PCA``
    object that was constructed but never fitted (its use was commented out);
    renamed the local ``knn`` so it no longer shadows the custom knn class;
    replaced the hard-coded ``range(418, 796)``/manual counter with an
    enumerate over the predictions so the mapping generalizes to any test size.
    """
    testPoints = [point.point for point in predictData]
    trainPoints = []
    trainLabels = []
    for fold in trainData:
        for point in fold:
            trainPoints.append(point.point)
            trainLabels.append(labels[point.id])
    # Standardize with statistics fitted on the training pool only.
    std_scale = preprocessing.StandardScaler().fit(trainPoints)
    X_transformed = std_scale.transform(trainPoints)
    newdata_transformed = std_scale.transform(testPoints)
    model = KNeighborsClassifier(n_neighbors=9, metric='euclidean')
    model.fit(np.array(X_transformed), np.array(trainLabels))
    y_pred = model.predict(np.array(newdata_transformed))
    finalAnswer = defaultdict(list)
    for offset, label in enumerate(y_pred):
        finalAnswer[418 + offset] = label
    return finalAnswer
def svm(self, predictData, trainData, labels):
    """Classify ``predictData`` with an ensemble of SVCs via majority vote.

    One SVC is trained per leave-one-fold-out split of ``trainData`` (with a
    25-component PCA fitted on that split); each model votes on every test
    record, keyed by record id starting at 418 (presumably fixed by the
    dataset — confirm against the caller). Returns a dict mapping record
    id -> majority-vote label (0 or 1; ties go to 1).

    Fixes vs. original: the inner prediction loop reused the outer loop
    variable ``i``; the test-point list was rebuilt inside every fold
    (loop-invariant, now hoisted); an unused ``hp()`` instance was removed;
    the hard-coded ``range(418, 796)``/manual counter is now an enumerate.
    """
    matrix = defaultdict(list)
    finalAnswer = defaultdict(list)
    pca = PCA(n_components=25)
    testPoints = [point.point for point in predictData]
    for heldOut in range(len(trainData)):
        trainPoints = []
        trainLabels = []
        for j, fold in enumerate(trainData):
            if j != heldOut:
                for point in fold:
                    trainPoints.append(point.point)
                    trainLabels.append(labels[point.id])
        clf = SVC()
        # PCA is refitted per split so the projection matches its training pool.
        X_transformed = pca.fit_transform(trainPoints)
        newdata_transformed = pca.transform(testPoints)
        clf.fit(np.array(X_transformed), np.array(trainLabels))
        y_pred = clf.predict(np.array(newdata_transformed))
        for offset, label in enumerate(y_pred):
            finalAnswer[418 + offset].append(label)
    for key in finalAnswer:
        matrix[key] = 0 if finalAnswer[key].count(0) > finalAnswer[key].count(1) else 1
    return matrix
def logisticRegression(self, predictData, trainData, labels):
    """Classify ``predictData`` with an ensemble of logistic-regression models.

    One model (full-SVD PCA + sag solver, max_iter=1500) is trained per
    leave-one-fold-out split of ``trainData``; every model votes on each test
    record (ids 418..795) and the majority label wins (ties go to 1).
    Returns a dict mapping record id -> 0 or 1.
    """
    h = hp()
    finalAnswer = defaultdict(list)
    matrix = defaultdict(list)
    testPoints = [point.point for point in predictData]
    for heldOut in range(len(trainData)):
        trainPoints = []
        trainLabels = []
        for j, fold in enumerate(trainData):
            if j == heldOut:
                continue
            for point in fold:
                trainPoints.append(point.point)
                trainLabels.append(labels[point.id])
        # Fresh PCA per split, fitted on that split's training pool.
        pca = PCA(svd_solver='full')
        projectedTrain = pca.fit_transform(trainPoints)
        projectedTest = pca.transform(testPoints)
        lr = LogisticRegression(solver='sag', max_iter=1500)
        lr.fit(projectedTrain, np.array(trainLabels))
        y_pred = lr.predict(np.array(projectedTest))
        for recordId in range(418, 796):
            finalAnswer[recordId].append(y_pred[recordId - 418])
    for key in finalAnswer:
        matrix[key] = 0 if finalAnswer[key].count(0) > finalAnswer[key].count(1) else 1
    return matrix
def knn(self, predictData=None, trainData=None):
    """Classify ``predictData`` in place with the custom k-NN classifier.

    Prompts for k, flattens ``trainData`` into the list form the classifier
    expects, and delegates to ``knn.classify``. Returns None; results are
    written onto the records of ``predictData`` by the classifier.
    """
    helper = hp()
    classifier = knn()
    nn = int(input("Enter the number of closest neighbors to consider: "))
    trainingList = helper.convertToList(list(trainData))
    classifier.classify(trainingList, predictData, nn)
def findDescriptorPosteriorProbabilites(self, classes, td):
    """Compute per-class descriptor statistics for Naive Bayes.

    For each class: mean/std-dev of the continuous attributes (via
    ``standardizeBayes``) and, for every categorical value seen in that
    class, its relative frequency keyed by ``(value, class)``. Returns
    (occurences, mean, stdDeviation).
    """
    occurences = defaultdict(int)
    mean, stdDeviation = defaultdict(dict), defaultdict(dict)
    for label, members in classes.items():
        mean[label], stdDeviation[label] = hp().standardizeBayes(members)
        for record in members:
            for column, value in enumerate(record.categoricalData):
                # Each (value, class) frequency is computed once.
                if (value, label) not in occurences:
                    count = self.countOccurence(value, column, members)
                    occurences[(value, label)] = count / len(members)
    return occurences, mean, stdDeviation
def bayes_naive(self, predictData, trainData):
    """Fit the custom Naive Bayes model on all of ``trainData`` and classify
    ``predictData`` in place.

    Returns ``predictData`` with predictions written onto its records by
    ``bayes.classify``.

    Fixes vs. original: removed the unused locals ``pd`` (a copy of
    predictData) and ``matrix`` (an empty defaultdict), plus a commented-out
    cross-validation loop.
    """
    h = hp()
    nb = bayes()
    td = h.convertToList(list(trainData))
    classPriorProbabilities = nb.findClassPriorProbability(td)
    classes = nb.segregateClasses(td)
    occurences, means, stdDev = nb.findDescriptorPosteriorProbabilites(classes, td)
    nb.classify(predictData, classPriorProbabilities, occurences, means, stdDev)
    return predictData
def knnDemo(self, predictData=None, trainData=None):
    """Run the custom k-NN classifier once (no cross validation) and report metrics.

    Prompts for k, classifies ``predictData`` against ``trainData``, and
    returns scalar (accuracy, precision, recall, f_score).
    """
    helper = hp()
    classifier = knn()
    nn = int(input("Enter the number of closest neighbors to consider: "))
    classifier.classify(trainData, predictData, nn)
    tp, tn, fp, fn = helper.findParams(predictData)
    accuracy = helper.findAccuracy(tp, tn, fp, fn)
    precision = helper.findPrecision(tp, tn, fp, fn)
    recall = helper.findRecall(tp, tn, fp, fn)
    f_score = helper.findFMeasure(precision, recall)
    return accuracy, precision, recall, f_score
def bayes_naive_demo(self, predictData, trainData):
    """Demo-mode Naive Bayes: print each class's posterior*prior and the winner.

    Fits on all of ``trainData``, scores ``predictData`` via
    ``classify_demo``, prints every class probability, then announces the
    arg-max class. Returns None.
    """
    helper = hp()
    nb = bayes()
    classPriorProbabilities = nb.findClassPriorProbability(trainData)
    classes = nb.segregateClasses(trainData)
    occurences, means, stdDev = nb.findDescriptorPosteriorProbabilites(
        classes, trainData)
    probabilities = nb.classify_demo(predictData, classPriorProbabilities,
                                     occurences, means, stdDev)
    classKey = -1
    maxProb = float('-inf')
    for key, prob in probabilities.items():
        print("P(X|H{})*P(H{}) = {}".format(key, key, prob))
        # Strictly-greater comparison: the first maximal class wins ties.
        if prob > maxProb:
            maxProb = prob
            classKey = key
    print("This test data record belongs to: Class {}".format(classKey))
def dt(data, labels, test_features=None):
    """10-fold cross validation of the custom decision tree.

    Concatenates ``labels`` onto ``data`` as the last column, drops NaN rows,
    then for each fold: trains a depth-10 tree (min 5 rows per leaf) on the
    z-score-filtered training split and evaluates on the held-out fold
    (z-scored row-wise — presumably to match how the tree was trained; verify).
    Prints aggregate test metrics and returns
    (trainAccuracy, testAccuracy, precision, recall, models, tree) where
    ``tree`` is the decisionTree helper instance.

    Fixes vs. original: the return statement was broken across two physical
    lines (a syntax error); the local ``dt`` shadowed this function's name
    (renamed ``tree``); the removed-in-pandas-2.0 ``DataFrame.append`` is
    replaced with ``pd.concat``.
    """
    from helpers import helpers as hp
    from decision_tree import decisionTree
    h = hp()
    tree = decisionTree()
    data = pd.concat([data, labels], axis=1)
    data.dropna(inplace=True)
    print(data.head())
    trainAccuracy = []
    testAccuracy = []
    precision = []
    recall = []
    f_score = []
    models = []
    foldSize = int(data.shape[0] / 10)
    for i in range(10):
        print("Running iteration " + str(i+1) + " of k cross validation")
        testData = data.loc[foldSize*i:foldSize*(i+1)-1]
        trainData = pd.concat([data.loc[:foldSize*i-1], data.loc[foldSize*(i+1):]])
        # Drop training rows with any |z| >= 3 (outlier filtering).
        trainData = trainData[(np.abs(stats.zscore(trainData.iloc[:, :-1])) < 3).all(axis=1)]
        root = tree.decision(trainData, depth=10, minLeafRows=5)
        testTarget = testData.iloc[:, -1].values.tolist()
        testPredicted = tree.predictData(
            pd.DataFrame(stats.zscore(testData.iloc[:, :-1], axis=1),
                         columns=testData.columns.values.tolist()[:-1]),
            root)
        trainTarget = trainData.iloc[:, -1].values.tolist()
        trainPredicted = tree.predictData(trainData.iloc[:, :-1], root)
        models.append(root)
        tp, tn, fp, fn = h.findParameters(trainPredicted, trainTarget)
        trainAccuracy.append(h.findAccuracy(tp, tn, fp, fn))
        tp, tn, fp, fn = h.findParameters(testPredicted, testTarget)
        testAccuracy.append(h.findAccuracy(tp, tn, fp, fn))
        tmpPrecision = h.findPrecision(tp, tn, fp, fn)
        tmpRecall = h.findRecall(tp, tn, fp, fn)
        precision.append(tmpPrecision)
        recall.append(tmpRecall)
        f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
    h.calculateMetrics(testAccuracy, precision, recall, f_score)
    return trainAccuracy, testAccuracy, precision, recall, models, tree
def rf(data, labels, test_features=None):
    """5-fold cross validation of the custom random forest, then bagged
    prediction on ``test_features``.

    Per fold: builds a forest on the training split, predicts the held-out
    fold, and reports via the module-level ``calMetrics``. Afterwards every
    fold's forest predicts ``test_features`` tree-ensemble-style and the
    per-row majority label is returned as a list.

    Fixes vs. original: locals ``rf``/``dt`` shadowed this function and the
    ``dt`` function (renamed); a ``p = pd.DataFrame()`` that was immediately
    overwritten by ``p = []`` is removed; removed-in-pandas-2.0
    ``DataFrame.append`` replaced with ``pd.concat``; unused ``fb_score``
    and a large commented-out metrics block removed.
    """
    from random_forest import randomForest
    from helpers import helpers as hp
    from decision_tree import decisionTree
    h = hp()
    forestBuilder = randomForest()
    tree = decisionTree()
    data = pd.concat([data, labels], axis=1)
    accuracy = []
    precision = []
    recall = []
    f_score = []
    models = []
    foldSize = int(data.shape[0] / 5)
    for i in range(5):
        print("Running iteration " + str(i+1) + " of k cross validation")
        testData = data.loc[foldSize*i:foldSize*(i+1)-1]
        trainData = pd.concat([data.loc[:foldSize*i-1], data.loc[foldSize*(i+1):]])
        forest = forestBuilder.forest(trainData)
        target = testData.iloc[:, -1].values.tolist()
        predicted = forestBuilder.predictForest(testData.iloc[:, :-1], forest)
        models.append(forest)
        calMetrics(target, predicted)
    # NOTE(review): the per-fold metric lists are never filled (metrics are
    # printed by calMetrics instead), so this call receives empty lists —
    # preserved from the original; confirm calculateMetrics tolerates it.
    h.calculateMetrics(accuracy, precision, recall, f_score)
    predicted = pd.DataFrame()
    for root in models:
        pred = tree.predictData(test_features, root)
        predicted = pd.concat([predicted, pd.DataFrame(pred)], axis=1)
    print(predicted)
    p = []
    for idx, row in predicted.iterrows():
        # value_counts sorts by frequency, so index[0] is the majority label.
        p.append(row.value_counts().index.tolist()[0])
    print(p)
    return p
def random_forest(self, kCrossValidation):
    """Interactive driver: k-fold cross validation of the custom random
    forest, then optional bagged evaluation on a user-supplied test file.

    Prompts for the data file, tree count, and feature count; prints
    train-side metrics, and — if a test file name is entered — prints
    bagged test-side metrics as well. Returns None.

    Fixes vs. original: a statement was broken across two physical lines
    (``testData =`` / ``h.oneHotEncoding(...)``, a syntax error); the bare
    ``except:`` around the int prompts is narrowed to ValueError; the
    removed-in-pandas-2.0 ``DataFrame.append``/``iteritems`` are replaced
    with ``pd.concat``/``items``; test-phase prediction now drops the label
    column (``testData.iloc[:, :-1]``) to match the train phase instead of
    feeding the label to the forest.
    """
    print("\nRunning Random Forest Classifier ....................\n")
    from random_forest import randomForest
    h = hp()
    fileName = h.get_fileName()
    filePath = "../Data/" + fileName + ".txt"
    # filePath = "CSE-601/project3/Data/"+fileName+".txt"
    data, labels = h.readData(filePath)
    data = h.oneHotEncoding(data, labels)
    rf = randomForest()
    try:
        numTrees = int(input("\nEnter number of trees: "))
        numFeatures = int(input("Enter number of features to consider: "))
    except ValueError:
        print("\nExecution Failed - Wrong Input")
        exit()
    accuracy = []
    precision = []
    recall = []
    f_score = []
    models = []
    foldSize = int(data.shape[0] / kCrossValidation)
    for i in range(kCrossValidation):
        print("Running iteration " + str(i + 1) + " of k cross validation .....")
        testData = data.loc[foldSize * i:foldSize * (i + 1) - 1]
        trainData = pd.concat([data.loc[:foldSize * i - 1],
                               data.loc[foldSize * (i + 1):]])
        forest = rf.forest(trainData, numTrees=numTrees, numFeatures=numFeatures)
        target = testData.iloc[:, -1].values.tolist()
        predicted = rf.predictForest(testData.iloc[:, :-1], forest)
        models.append(forest)
        tp, tn, fp, fn = h.findParameters(predicted, target)
        accuracy.append(h.findAccuracy(tp, tn, fp, fn))
        tmpPrecision = h.findPrecision(tp, tn, fp, fn)
        tmpRecall = h.findRecall(tp, tn, fp, fn)
        precision.append(tmpPrecision)
        recall.append(tmpRecall)
        f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
    print("\nMetrics on train data with k-cross validation")
    h.calculateMetrics(accuracy, precision, recall, f_score)
    fileName = input(
        "\nEnter test data file name without extension (if no test file, just press enter): "
    )
    if fileName != '':
        filePath = "../Data/" + fileName + ".txt"
        # filePath = "CSE-601/project3/Data/"+fileName+".txt"
        testData, testLabels = h.readData(filePath)
        testData = h.oneHotEncoding(testData, testLabels)
        predLabels = []
        for forest in models:
            # Drop the label column, consistent with the training phase above.
            predLabels.append(rf.predictForest(testData.iloc[:, :-1], forest))
        predLabels = pd.DataFrame(predLabels)
        pred = []
        for _, colData in predLabels.items():
            # Majority vote across the per-fold forests for each record.
            pred.append(colData.value_counts().index[0])
        tp, tn, fp, fn = h.findParameters(pred, testData.iloc[:, -1].values.tolist())
        accuracy = [h.findAccuracy(tp, tn, fp, fn)]
        precision = h.findPrecision(tp, tn, fp, fn)
        recall = h.findRecall(tp, tn, fp, fn)
        f_score = [h.findFMeasure(precision, recall)]
        print("\nMetrics on test data with bagging")
        h.calculateMetrics(accuracy, [precision], [recall], f_score)
def decision_tree(self, kCrossValidation):
    """Interactive driver: decision-tree training either on the full data
    (kCrossValidation <= 1) or with k-fold cross validation, then optional
    bagged evaluation on a user-supplied test file.

    Prompts for the data file; prints train-side metrics, and — if a test
    file name is entered — prints bagged test-side metrics (per-row majority
    vote across the fold models). Returns None.

    Fixes vs. original: a statement was broken across two physical lines
    (``h.findFMeasure(tmpPrecision,`` / ``tmpRecall))``, a syntax error);
    the removed-in-pandas-2.0 ``DataFrame.append`` is replaced with
    ``pd.concat``.
    """
    print("\nRunning Decision Tree Classifier ....................\n")
    from decision_tree import decisionTree
    h = hp()
    fileName = h.get_fileName()
    # filePath = "../Data/"+fileName+".txt"
    filePath = "CSE-601/project3/Data/" + fileName + ".txt"
    data, labels = h.readData(filePath)
    data = h.oneHotEncoding(data, labels)
    dt = decisionTree()
    accuracy = []
    precision = []
    recall = []
    f_score = []
    models = []
    if kCrossValidation <= 1:
        # No cross validation: train and evaluate on the full dataset.
        root = dt.decision(data)
        print(root)
        target = data.iloc[:, -1]
        predicted = dt.predictData(data.iloc[:, :-1], root)
        tp, tn, fp, fn = h.findParameters(predicted, target)
        accuracy.append(h.findAccuracy(tp, tn, fp, fn))
        tmpPrecision = h.findPrecision(tp, tn, fp, fn)
        tmpRecall = h.findRecall(tp, tn, fp, fn)
        precision.append(tmpPrecision)
        recall.append(tmpRecall)
        f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
    else:
        foldSize = int(data.shape[0] / kCrossValidation)
        for i in range(kCrossValidation):
            print("Running iteration " + str(i + 1) +
                  " of k cross validation .....")
            testData = data.loc[foldSize * i:foldSize * (i + 1) - 1]
            trainData = pd.concat([data.loc[:foldSize * i - 1],
                                   data.loc[foldSize * (i + 1):]])
            root = dt.decision(trainData)
            target = testData.iloc[:, -1].values.tolist()
            predicted = dt.predictData(testData.iloc[:, :-1], root)
            models.append(root)
            tp, tn, fp, fn = h.findParameters(predicted, target)
            accuracy.append(h.findAccuracy(tp, tn, fp, fn))
            tmpPrecision = h.findPrecision(tp, tn, fp, fn)
            tmpRecall = h.findRecall(tp, tn, fp, fn)
            precision.append(tmpPrecision)
            recall.append(tmpRecall)
            f_score.append(h.findFMeasure(tmpPrecision, tmpRecall))
    print("\nMetrics on train data with k-cross validation")
    h.calculateMetrics(accuracy, precision, recall, f_score)
    fileName = input(
        "\nEnter test data file name without extension (if no test file, just press enter): "
    )
    if fileName != '':
        filePath = "../Data/" + fileName + ".txt"
        # filePath = "CSE-601/project3/Data/"+fileName+".txt"
        testData, testLabels = h.readData(filePath)
        testData = h.oneHotEncoding(testData, testLabels)
        predLabels = []
        for _, row in testData.iloc[:, :-1].iterrows():
            # Majority vote across the per-fold trees for each record.
            predictedRow = [dt.predictRow(row, root) for root in models]
            predLabels.append(max(set(predictedRow), key=predictedRow.count))
        tp, tn, fp, fn = h.findParameters(
            predLabels, testData.iloc[:, -1].values.tolist())
        accuracy = [h.findAccuracy(tp, tn, fp, fn)]
        precision = h.findPrecision(tp, tn, fp, fn)
        recall = h.findRecall(tp, tn, fp, fn)
        f_score = [h.findFMeasure(precision, recall)]
        print("\nMetrics on test data with bagging")
        h.calculateMetrics(accuracy, [precision], [recall], f_score)
accuracy = [ h.findAccuracy(truePositives, trueNegatives, falsePositives, falseNegatives) ] precision = h.findPrecision(truePositives, trueNegatives, falsePositives, falseNegatives) recall = h.findRecall(truePositives, trueNegatives, falsePositives, falseNegatives) f_score = [h.findFMeasure(precision, recall)] print("\nMetrics on test data with bagging") h.calculateMetrics(accuracy, [precision], [recall], f_score) if __name__ == "__main__": m = main() h = hp() algorithm = int( input( "Enter 0 to run K-Nearest Neighbors in demo mode\nEnter 1 for K-Nearest Neigbour Algorithm\nEnter 2 for Decision Tree Algorithm\nEnter 3 for Naive Bayes Algorithm\nEnter 4 to run Naive Bayes Algorithm in demo mode\nEnter 5 for Random Forest Algorithm\n" )) if algorithm == 0: print("Enter train File name") trainData = h.get_file_demo(h.get_fileName()) print("Enter test File name") predictData = h.get_file_demo(h.get_fileName(), fileType='predictData') accuracy, precision, recall, f_score = m.knnDemo( predictData, trainData) h.calculateMetricsDemo(accuracy, precision, recall, f_score) elif algorithm == 1: