def KNN_parameter(path):
    """Run the K-fold KNN parameter test on the dataset at *path* and plot it.

    The dataset directory is preprocessed in place by the project helpers in
    ``main``, loaded with ``sklearn.datasets.load_files``, cleaned with
    ``util.refine_all_emails`` and converted to TF-IDF features. The features
    and labels are handed to ``KFOLD_KNN_parameter_test`` and its two result
    series are plotted under the labels 'uniform' and 'weighted'.

    Note: ``KNN_parameter`` is defined more than once in this file; the
    definition that appears last is the one bound when the module is loaded.

    Args:
        path: Root directory of the dataset, passed to the preprocessing
            helpers and to ``load_files``.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Classifier: K Nearest Neighbors")
    print("KFOLD parameter test")

    # Preprocess the dataset directory.
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # Load data.
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Refine emails - delete unwanted text from them. The return value is not
    # used, so this presumably edits files.data in place (confirm in util).
    util.refine_all_emails(files.data)

    # Feature extraction: bag of words, then TF-IDF.
    BOW = util.bagOfWords(files.data)

    # use_idf=True so the features really are TF-IDF; with use_idf=False the
    # transformer only produces normalised term frequencies.
    TFIDF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit_transform(BOW)

    # k in k-fold.
    n_cross_val = 5

    # Calculate results.
    i, uniform_results, weighted_results = KFOLD_KNN_parameter_test(
        TFIDF, files.target, n_cross_val=n_cross_val, n_neighbors=5)

    # Plot.
    plot_results(i, [uniform_results, weighted_results],
                 ['uniform', 'weighted'])
def NB(path):
    """Evaluate Multinomial Naive Bayes on BOW, TF and TF-IDF features.

    The dataset directory is preprocessed in place by the project helpers in
    ``main``, loaded with ``sklearn.datasets.load_files`` and cleaned with
    ``util.refine_all_emails``. Three feature matrices are built (bag of
    words, term frequency, TF-IDF); each is passed with the labels to
    ``split_test_classifier`` and the three result series are plotted.

    Args:
        path: Root directory of the dataset, passed to the preprocessing
            helpers and to ``load_files``.
    """
    print("Classifier: Naive Bayes")
    print("Train-Test Split")

    # Preprocess the dataset directory.
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # Load data.
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Refine emails - delete unwanted text from them. The return value is not
    # used, so this presumably edits files.data in place (confirm in util).
    util.refine_all_emails(files.data)

    # Feature extraction.
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF: term frequencies only (IDF weighting disabled).
    TF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit_transform(BOW)

    # TF-IDF: IDF weighting must be enabled here, otherwise this matrix is
    # identical to TF and the 'TF' / 'TFIDF' curves coincide.
    TFIDF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit_transform(BOW)

    # Build classifier.
    clf = sklearn.naive_bayes.MultinomialNB()

    # Calculate results. `i` is overwritten by each call; the value from the
    # last call is the one handed to plot_results.
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # Plot.
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
def KNN_parameter(path):
    """Plot the K-fold parameter test for K Nearest Neighbors.

    Steps: preprocess the dataset directory with the ``main`` helpers, load
    it with ``sklearn.datasets.load_files``, clean the emails with
    ``util.refine_all_emails``, build TF-IDF features from the bag of words,
    run ``KFOLD_KNN_parameter_test`` and plot its two result series as
    'uniform' and 'weighted'.

    Note: an earlier definition of ``KNN_parameter`` also exists in this
    file; a later definition replaces an earlier one when the module loads.

    Args:
        path: Root directory of the dataset, passed to the preprocessing
            helpers and to ``load_files``.
    """
    print("Classifier: K Nearest Neighbors")
    print("KFOLD parameter test")

    # Preprocess.
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # Load data.
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Refine emails - delete unwanted text from them. The result of the call
    # is discarded, so it presumably modifies files.data in place (confirm).
    util.refine_all_emails(files.data)

    # Feature extraction.
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF-IDF. IDF weighting is enabled explicitly: with use_idf=False the
    # result would be plain term frequency despite the TFIDF name.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True)
    TFIDF = tfidf_transformer.fit_transform(BOW)

    # k in k-fold.
    n_cross_val = 5

    # Calculate results.
    i, uniform_results, weighted_results = KFOLD_KNN_parameter_test(
        TFIDF, files.target, n_cross_val=n_cross_val, n_neighbors=5)

    # Plot.
    plot_results(i, [uniform_results, weighted_results],
                 ['uniform', 'weighted'])
def KNN(path):
    """Evaluate K Nearest Neighbors on BOW, TF and TF-IDF features.

    The dataset directory is preprocessed in place by the project helpers in
    ``main``, loaded with ``sklearn.datasets.load_files`` and cleaned with
    ``util.refine_all_emails``. A distance-weighted 5-neighbour classifier is
    passed to ``split_test_classifier`` once per feature matrix and the three
    result series are plotted.

    Note: ``KNN`` is defined more than once in this file; the definition
    that appears last is the one bound when the module is loaded.

    Args:
        path: Root directory of the dataset, passed to the preprocessing
            helpers and to ``load_files``.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Classifier: K Nearest Neighbors")
    print("Train-Test Split")

    # Preprocess the dataset directory.
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # Load data.
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Refine emails - delete unwanted text from them. The return value is not
    # used, so this presumably edits files.data in place (confirm in util).
    util.refine_all_emails(files.data)

    # Feature extraction.
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF: term frequencies only (IDF weighting disabled).
    TF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit_transform(BOW)

    # TF-IDF: IDF weighting must be enabled here, otherwise this matrix is
    # identical to TF and the 'TF' / 'TFIDF' curves coincide.
    TFIDF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit_transform(BOW)

    # Build classifier: neighbours vote weighted by distance
    # (the alternative setting is weights='uniform').
    n_neighbors = 5
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # Calculate results. `i` is overwritten by each call; the value from the
    # last call is the one handed to plot_results.
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # Plot.
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
def SVM(path):
    """Evaluate a linear Support Vector Machine on BOW, TF and TF-IDF features.

    The dataset directory is preprocessed in place by the project helpers in
    ``main``, loaded with ``sklearn.datasets.load_files`` and cleaned with
    ``util.refine_all_emails``. A ``LinearSVC`` is passed to
    ``split_test_classifier`` once per feature matrix and the three result
    series are plotted.

    Args:
        path: Root directory of the dataset, passed to the preprocessing
            helpers and to ``load_files``.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Classifier: Support Vector Machine")
    print("Train-Test Split")

    # Preprocess the dataset directory.
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # Load data.
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Refine emails - delete unwanted text from them. The return value is not
    # used, so this presumably edits files.data in place (confirm in util).
    util.refine_all_emails(files.data)

    # Feature extraction.
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF: term frequencies only (IDF weighting disabled).
    TF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit_transform(BOW)

    # TF-IDF: IDF weighting must be enabled here, otherwise this matrix is
    # identical to TF and the 'TF' / 'TFIDF' curves coincide.
    TFIDF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit_transform(BOW)

    # Build classifier.
    clf = sklearn.svm.LinearSVC()

    # Calculate results. `i` is overwritten by each call; the value from the
    # last call is the one handed to plot_results.
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # Plot.
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
def KNN(path):
    """Plot train/test-split results for K Nearest Neighbors.

    Steps: preprocess the dataset directory with the ``main`` helpers, load
    it with ``sklearn.datasets.load_files``, clean the emails with
    ``util.refine_all_emails``, derive bag-of-words, TF and TF-IDF feature
    matrices, evaluate a distance-weighted 5-neighbour classifier on each via
    ``split_test_classifier`` and plot the three result series.

    Note: an earlier definition of ``KNN`` also exists in this file; a later
    definition replaces an earlier one when the module loads.

    Args:
        path: Root directory of the dataset, passed to the preprocessing
            helpers and to ``load_files``.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Classifier: K Nearest Neighbors")
    print("Train-Test Split")

    # Preprocess.
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # Load data.
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Refine emails - delete unwanted text from them. The result of the call
    # is discarded, so it presumably modifies files.data in place (confirm).
    util.refine_all_emails(files.data)

    # Feature extraction.
    # BOW
    BOW = util.bagOfWords(files.data)

    # TF: IDF weighting switched off, leaving term frequencies.
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False)
    TF = tf_transformer.fit_transform(BOW)

    # TF-IDF: IDF weighting switched on. With use_idf=False this matrix would
    # equal TF, making the 'TF' and 'TFIDF' comparison meaningless.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True)
    TFIDF = tfidf_transformer.fit_transform(BOW)

    # Build classifier: neighbours vote weighted by distance
    # (the alternative setting is weights='uniform').
    n_neighbors = 5
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # Calculate results. `i` is reassigned by every call; plot_results gets
    # the value returned by the final one.
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # Plot.
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])