def main(dataset_name, testset_name, new_emails=False, klist=None):
    """Run the knn classifier for a training set and a test set.

    The training set is loaded through ``ex.loadTrainingset``, the emails to
    classify through ``ex.loadTestset``, and ``knn.classify`` is called once
    with every value of k in ``klist``.  Unless ``new_emails`` is set, the
    accuracy for each k is printed and plotted against k.

    Args:
        dataset_name: Name of the training-set directory.  Relative names are
            resolved against the directory containing this file.
        testset_name: Name of the directory holding the emails to classify,
            resolved the same way.  A ``results`` sub-directory is created
            inside it if it does not exist yet.
        new_emails: When true, the folder list handed to ``ex.loadTestset``
            is ``[""]`` instead of the sub-folders of the test directory, and
            the accuracy report and plot are skipped.
        klist: Values of k to evaluate.  Defaults to ``[1, 3]``.

    Raises:
        ValueError: If ``klist`` is empty, if either data set is empty, or if
            training and test rows differ in length.
    """
    klist = [1, 3] if klist is None else list(klist)
    if len(klist) == 0:
        raise ValueError("klist must contain at least one value of k")

    base_dir = os.path.dirname(os.path.abspath(__file__))
    # The empty last component makes os.path.join end each path with the
    # platform separator, like the hand-built paths did on Windows, in case
    # the ex/knn helpers append file names by plain concatenation.
    trainingset_path = os.path.join(base_dir, dataset_name, "")
    testset_path = os.path.join(base_dir, testset_name, "")
    results_path = os.path.join(testset_path, "results", "")
    if not os.path.exists(results_path):
        os.mkdir(results_path)

    if new_emails:
        folder_names = [""]
    else:
        # Immediate sub-directories of the test set, minus our own output.
        folder_names = next(os.walk(testset_path))[1]
        if 'results' in folder_names:
            folder_names.remove('results')

    workfilename = 'mergedworkfile.csv'
    wordfilename = 'wordfile.csv'
    email_kind = 'New' if new_emails else 'Test'

    # loadTrainingset receives this list as an argument; it is read back
    # below, so the loader is expected to fill it in place.
    trainingSet = []
    print("Loading Training Set...")
    wordsd, subd, digramsd, trigramsd = ex.loadTrainingset(
        trainingset_path, workfilename, wordfilename, trainingSet)
    print("Training Set loaded.")

    print('Collecting ' + email_kind + ' Emails...')
    testSet, _all_files = ex.loadTestset(
        testset_path, folder_names, wordsd, subd, digramsd, trigramsd)
    print(email_kind + ' Emails Collected.')

    if len(trainingSet) == 0 or len(testSet) == 0:
        raise ValueError("training set and test set must both be non-empty")
    if len(trainingSet[0]) != len(testSet[0]):
        raise ValueError(
            "training rows have {0} entries but test rows have {1}".format(
                len(trainingSet[0]), len(testSet[0])))

    list_of_predictions = knn.classify(klist, trainingSet, testSet,
                                       results_path)

    if not new_emails:
        # list_of_predictions[x][i] is read as the prediction for test row x
        # under klist[i]; collect one column per k and score it.
        accuracies = []
        for k_index, k in enumerate(klist):
            predictions = [list_of_predictions[x][k_index]
                           for x in range(len(testSet))]
            accuracy = knn.getAccuracy(testSet, predictions)
            accuracies.append(accuracy)
            print('K: ' + repr(k))
            print('Accuracy: ' + repr(accuracy) + '%')
        # Mean of the per-k accuracies.
        print('Overall Accuracy: '
              + str(sum(accuracies) / len(accuracies)) + "%")
        plt.plot(klist, accuracies)
        plt.xlabel('K')
        plt.ylabel('Accuracy')
        plt.show()

    print('Find the results at: ' + results_path)
def main(dataset_name, testset_name, new_emails=False):
    """Run the mnb classifier for a training set and a test set.

    The training set is loaded through ``ex.loadTrainingset`` and summarised
    with ``mnb.summarizeByClass``; the emails loaded by ``ex.loadTestset``
    are then classified with ``mnb.getPredictions``.  Every classified file
    is copied into ``results/<folder>`` inside the test directory, where
    ``<folder>`` is the training-set sub-folder selected by its prediction.

    Args:
        dataset_name: Name of the training-set directory.  Relative names are
            resolved against the directory containing this file.
        testset_name: Name of the directory holding the emails to classify,
            resolved the same way.  A ``results`` sub-directory is created
            inside it if it does not exist yet.
        new_emails: When true, the folder list handed to ``ex.loadTestset``
            is ``[""]`` instead of the sub-folders of the test directory, and
            no accuracy is reported.

    Returns:
        A list with the predicted folder name for each classified email, in
        the order of the predictions.

    Raises:
        ValueError: If either data set is empty, or if training and test rows
            differ in length.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    # os.path.join picks the separator for the running platform, so no
    # per-OS branch is needed (the old Windows/Linux switch left the paths
    # undefined everywhere else).  The empty last component keeps the
    # trailing separator the hand-built paths had.
    trainingset_path = os.path.join(base_dir, dataset_name, "")
    testset_path = os.path.join(base_dir, testset_name, "")
    results_path = os.path.join(testset_path, "results", "")
    if not os.path.exists(results_path):
        os.mkdir(results_path)

    if new_emails:
        test_folders = [""]
    else:
        # Immediate sub-directories of the test set, minus our own output.
        test_folders = next(os.walk(testset_path))[1]
        if 'results' in test_folders:
            test_folders.remove('results')

    workfilename = 'mergedworkfile.csv'
    wordfilename = 'wordfile.csv'
    email_kind = 'New' if new_emails else 'Test'

    # loadTrainingset receives this list as an argument; it is read back
    # below, so the loader is expected to fill it in place.
    trainingSet = []
    print("Loading Training Set...")
    wordsd, subd, digramsd, trigramsd = ex.loadTrainingset(
        trainingset_path, workfilename, wordfilename, trainingSet)
    print("Training Set loaded.")

    print('Collecting ' + email_kind + ' Emails...')
    testSet, all_files = ex.loadTestset(
        testset_path, test_folders, wordsd, subd, digramsd, trigramsd)
    print(email_kind + ' Emails Collected.')

    if len(trainingSet) == 0 or len(testSet) == 0:
        raise ValueError("training set and test set must both be non-empty")
    if len(trainingSet[0]) != len(testSet[0]):
        raise ValueError(
            "training rows have {0} entries but test rows have {1}".format(
                len(trainingSet[0]), len(testSet[0])))

    # Prepare the model, then classify the collected emails.
    summaries, class_prior_probabilities = mnb.summarizeByClass(trainingSet)
    predictions = mnb.getPredictions(
        summaries, class_prior_probabilities, testSet, results_path)

    # Predictions are used as indexes into the training set's sub-folders.
    # NOTE(review): this relies on os.walk listing them in the same order
    # the training loader used to number the classes - confirm.
    class_folders = next(os.walk(trainingset_path))[1]
    if 'results' in class_folders:
        class_folders.remove('results')
    for fname in class_folders:
        target_dir = os.path.join(results_path, fname)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

    predicted_folders = []
    for i, predicted in enumerate(predictions):
        folder = class_folders[predicted]
        shutil.copy2(all_files[i], os.path.join(results_path, folder))
        predicted_folders.append(folder)

    if not new_emails:
        accuracy = mnb.getAccuracy(testSet, predictions)
        print('Accuracy: {0}%'.format(accuracy))

    print('Find the results at: ' + results_path)
    return predicted_folders
def main(dataset_name, testset_name, new_emails=False):
    """Run the mnb classifier for a training set and a test set.

    Loads the training set with ``ex.loadTrainingset``, builds the model with
    ``mnb.summarizeByClass`` and classifies the emails returned by
    ``ex.loadTestset`` with ``mnb.getPredictions``.  Each classified file is
    then copied into ``results/<folder>`` inside the test directory, where
    ``<folder>`` is the training-set sub-folder selected by its prediction.

    Args:
        dataset_name: Name of the training-set directory.  Relative names are
            resolved against the directory containing this file.
        testset_name: Name of the directory holding the emails to classify,
            resolved the same way.  A ``results`` sub-directory is created
            inside it if it does not exist yet.
        new_emails: When true, the folder list handed to ``ex.loadTestset``
            is ``[""]`` instead of the sub-folders of the test directory, and
            no accuracy is reported.

    Returns:
        A list with the predicted folder name for each classified email, in
        the order of the predictions.

    Raises:
        ValueError: If either data set is empty, or if training and test rows
            differ in length.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    # Built with os.path.join instead of literal backslashes so the function
    # also works outside Windows.  The empty last component keeps the
    # trailing separator the hand-built paths had.
    trainingset_path = os.path.join(here, dataset_name, "")
    testset_path = os.path.join(here, testset_name, "")
    results_path = os.path.join(testset_path, "results", "")
    if not os.path.exists(results_path):
        os.mkdir(results_path)

    if new_emails:
        source_folders = [""]
    else:
        # Immediate sub-directories of the test set, minus our own output.
        source_folders = next(os.walk(testset_path))[1]
        if 'results' in source_folders:
            source_folders.remove('results')

    workfilename = 'mergedworkfile.csv'
    wordfilename = 'wordfile.csv'
    email_kind = 'New' if new_emails else 'Test'

    # loadTrainingset receives this list as an argument; it is read back
    # below, so the loader is expected to fill it in place.
    trainingSet = []
    print("Loading Training Set...")
    wordsd, subd, digramsd, trigramsd = ex.loadTrainingset(
        trainingset_path, workfilename, wordfilename, trainingSet)
    print("Training Set loaded.")

    print('Collecting ' + email_kind + ' Emails...')
    testSet, all_files = ex.loadTestset(
        testset_path, source_folders, wordsd, subd, digramsd, trigramsd)
    print(email_kind + ' Emails Collected.')

    if len(trainingSet) == 0 or len(testSet) == 0:
        raise ValueError("training set and test set must both be non-empty")
    if len(trainingSet[0]) != len(testSet[0]):
        raise ValueError(
            "training rows have {0} entries but test rows have {1}".format(
                len(trainingSet[0]), len(testSet[0])))

    # Prepare the model.
    summaries, classpriorprobabilities = mnb.summarizeByClass(trainingSet)
    # Test the model.
    predictions = mnb.getPredictions(
        summaries, classpriorprobabilities, testSet, results_path)

    # Predictions are used as indexes into the training set's sub-folders.
    # NOTE(review): this relies on os.walk listing them in the same order
    # the training loader used to number the classes - confirm.
    target_folders = next(os.walk(trainingset_path))[1]
    if 'results' in target_folders:
        target_folders.remove('results')
    for fname in target_folders:
        destination = os.path.join(results_path, fname)
        if not os.path.exists(destination):
            os.mkdir(destination)

    predicted_folders = []
    for i, predicted in enumerate(predictions):
        chosen = target_folders[predicted]
        shutil.copy2(all_files[i], os.path.join(results_path, chosen))
        predicted_folders.append(chosen)

    if not new_emails:
        # Accuracy is only reported for the test-set run, not for new mails.
        accuracy = mnb.getAccuracy(testSet, predictions)
        print('Accuracy: {0}%'.format(accuracy))

    print('Find the results at: ' + results_path)
    return predicted_folders
def main(dataset_name, testset_name, new_emails=False, klist=None):
    """Run the knn classifier for a training set and a test set.

    Loads the training set with ``ex.loadTrainingset`` and the emails to
    classify with ``ex.loadTestset``, then calls ``knn.classify`` once with
    all values of k in ``klist``.  Unless ``new_emails`` is set, the accuracy
    obtained for each k is printed and plotted against k.

    Args:
        dataset_name: Name of the training-set directory.  Relative names are
            resolved against the directory containing this file.
        testset_name: Name of the directory holding the emails to classify,
            resolved the same way.  A ``results`` sub-directory is created
            inside it if it does not exist yet.
        new_emails: When true, the folder list handed to ``ex.loadTestset``
            is ``[""]`` instead of the sub-folders of the test directory, and
            the accuracy report and plot are skipped.
        klist: Values of k to evaluate.  Defaults to ``[1, 3]``.

    Raises:
        ValueError: If ``klist`` is empty, if either data set is empty, or if
            training and test rows differ in length.
    """
    if klist is None:
        klist = [1, 3]
    else:
        klist = list(klist)
    if len(klist) == 0:
        raise ValueError("klist must contain at least one value of k")

    here = os.path.dirname(os.path.abspath(__file__))
    # Built with os.path.join instead of literal backslashes so the function
    # also works outside Windows.  The empty last component keeps the
    # trailing separator the hand-built paths had.
    trainingset_path = os.path.join(here, dataset_name, "")
    testset_path = os.path.join(here, testset_name, "")
    results_path = os.path.join(testset_path, "results", "")
    if not os.path.exists(results_path):
        os.mkdir(results_path)

    if new_emails:
        folder_names = [""]
    else:
        # Immediate sub-directories of the test set, minus our own output.
        folder_names = next(os.walk(testset_path))[1]
        if 'results' in folder_names:
            folder_names.remove('results')

    workfilename = 'mergedworkfile.csv'
    wordfilename = 'wordfile.csv'
    email_kind = 'New' if new_emails else 'Test'

    # loadTrainingset receives this list as an argument; it is read back
    # below, so the loader is expected to fill it in place.
    trainingSet = []
    print("Loading Training Set...")
    wordsd, subd, digramsd, trigramsd = ex.loadTrainingset(
        trainingset_path, workfilename, wordfilename, trainingSet)
    print("Training Set loaded.")

    print('Collecting ' + email_kind + ' Emails...')
    testSet, _all_files = ex.loadTestset(
        testset_path, folder_names, wordsd, subd, digramsd, trigramsd)
    print(email_kind + ' Emails Collected.')

    if len(trainingSet) == 0 or len(testSet) == 0:
        raise ValueError("training set and test set must both be non-empty")
    if len(trainingSet[0]) != len(testSet[0]):
        raise ValueError(
            "training rows have {0} entries but test rows have {1}".format(
                len(trainingSet[0]), len(testSet[0])))

    list_of_predictions = knn.classify(klist, trainingSet, testSet,
                                       results_path)

    if not new_emails:
        # Score the labelled test mails: list_of_predictions[x][i] is read as
        # the prediction for test row x under klist[i].
        acc = []
        for i, k in enumerate(klist):
            predictions = [list_of_predictions[x][i]
                           for x in range(len(testSet))]
            accuracy = knn.getAccuracy(testSet, predictions)
            acc.append(accuracy)
            print('K: ' + repr(k))
            print('Accuracy: ' + repr(accuracy) + '%')
        # Mean of the per-k accuracies.
        print('Overall Accuracy: ' + str(sum(acc) / len(acc)) + "%")
        plt.plot(klist, acc)
        plt.xlabel('K')
        plt.ylabel('Accuracy')
        plt.show()

    print('Find the results at: ' + results_path)