Example #1
0
def KNN_parameter(path):
    """Run the K-fold parameter test for the K Nearest Neighbors classifier.

    The dataset under ``path`` is preprocessed, loaded, cleaned and turned
    into a feature matrix, which is handed to ``KFOLD_KNN_parameter_test``.
    The uniform and weighted result series it returns are then plotted.

    Args:
        path: Root directory of the dataset; passed to the preprocessing
            helpers and to ``sklearn.datasets.load_files``.
    """
    # A parenthesised single-argument print produces the same output as a
    # Python 2 statement and is valid as a function call on Python 3.
    print("Classifier: K Nearest Neighbors")
    print("KFOLD parameter test")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    # (return value is unused; presumably files.data is edited in place -
    # confirm against util.refine_all_emails)
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)
    # NOTE(review): with use_idf=False the transformer only normalises term
    # frequencies, so this matrix is TF rather than TF-IDF despite its name.
    # Left unchanged here; confirm whether use_idf=True was intended.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # k in kfold
    n_cross_val = 5

    # calculate results
    i, uniform_results, weighted_results = KFOLD_KNN_parameter_test(
        TFIDF, files.target, n_cross_val=n_cross_val, n_neighbors=5)

    # plot
    plot_results(i, [uniform_results, weighted_results],
                 ['uniform', 'weighted'])
def main_test(path=None):
    """Evaluate three classifiers on TF-IDF features of the dataset.

    Loads the files under the dataset directory, cleans them, builds a
    bag-of-words matrix, converts it to TF-IDF and runs ``test_classifier``
    for Naive Bayes, a linear SVM and K-Nearest Neighbours in turn.

    Args:
        path: Dataset root directory. When omitted (or otherwise falsy) the
            relative directory ``'dataset'`` is used instead of passing
            ``None`` on to the loaders.
    """
    # Fall back to the default directory; the bare ``path`` would hand None
    # to remove_incompatible_files / load_files when no argument is given.
    dir_path = path or 'dataset'

    remove_incompatible_files(dir_path)

    print('\n\n')

    # load data
    print(colored('Loading files into memory', 'green', attrs=['bold']))
    files = sklearn.datasets.load_files(dir_path)

    # refine all emails
    # (return value is unused; presumably files.data is edited in place)
    print(colored('Refining all files', 'green', attrs=['bold']))
    util.refine_all_emails(files.data)

    # calculate the BOW representation
    print(colored('Calculating BOW', 'green', attrs=['bold']))
    word_counts = util.bagOfWords(files.data)

    # TFIDF
    print(colored('Calculating TFIDF', 'green', attrs=['bold']))
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)

    print('\n\n')

    # 0.2 means 80% training data and 20% test data
    test_sizes = [0.2]

    # (banner, classifier) pairs, evaluated in this order
    classifiers = [
        ('TFIDF with Naive Bayes',
         sklearn.naive_bayes.MultinomialNB()),
        ('TFIDF with Support Vector Machine',
         sklearn.svm.LinearSVC()),
        ('TFIDF with K-Nearest Neighbours',
         sklearn.neighbors.KNeighborsClassifier(11, weights='distance')),
    ]

    for position, (title, clf) in enumerate(classifiers):
        # blank separator between consecutive classifier reports
        if position:
            print('\n\n')
        print(colored(title, 'red', attrs=['bold']))
        for test_size in test_sizes:
            test_classifier(X, files.target, clf, test_size,
                            y_names=files.target_names, confusion=False)
Example #3
0
def NB(path):
    """Compare BOW, TF and TF-IDF features with a Naive Bayes classifier.

    The dataset under ``path`` is preprocessed, loaded and cleaned. Three
    feature matrices are derived from it, each is passed to
    ``split_test_classifier`` with the same ``MultinomialNB`` instance, and
    the three result series are plotted together.

    Args:
        path: Root directory of the dataset; passed to the preprocessing
            helpers and to ``sklearn.datasets.load_files``.
    """
    print("Classifier: Naive Bayes")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    # (return value is unused; presumably files.data is edited in place)
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)
    # TF: normalised term frequencies, no inverse-document-frequency weights
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)
    # TFIDF: IDF weighting must be enabled, otherwise this matrix is an
    # exact duplicate of TF and the 'TF' / 'TFIDF' curves coincide.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier
    clf = sklearn.naive_bayes.MultinomialNB()

    # calculate results; `i` from the last call is the one used for plotting
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
Example #4
0
def KNN_parameter(path):
    """K-fold parameter sweep for the K Nearest Neighbors classifier.

    Preprocesses and loads the dataset rooted at ``path``, builds the
    feature matrix, runs ``KFOLD_KNN_parameter_test`` on it and plots the
    uniform and weighted result series that call returns.
    """
    for banner in ("Classifier: K Nearest Neighbors", "KFOLD parameter test"):
        print(banner)

    # Dataset preparation on disk, then load it into memory (shuffled).
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)
    dataset = sklearn.datasets.load_files(path, shuffle=True)

    # Strip unwanted text from the emails; the helper's return value is not
    # used, so it presumably rewrites dataset.data in place.
    util.refine_all_emails(dataset.data)

    # Bag-of-words counts, then the transformer output used as features.
    # NOTE(review): use_idf=False means no IDF weighting is applied, i.e.
    # these are normalised term frequencies - confirm this is intended.
    counts = util.bagOfWords(dataset.data)
    transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False)
    features = transformer.fit(counts).transform(counts)

    # Number of folds for the cross-validation.
    fold_count = 5

    x_values, uniform_scores, weighted_scores = KFOLD_KNN_parameter_test(
        features,
        dataset.target,
        n_cross_val=fold_count,
        n_neighbors=5,
    )

    series = [uniform_scores, weighted_scores]
    labels = ['uniform', 'weighted']
    plot_results(x_values, series, labels)
Example #5
0
def main_test(path=None):
    """Train-test evaluation of a distance-weighted KNN on TF-IDF features.

    Uses ``path`` as the dataset directory, or ``'dataset'`` when ``path``
    is falsy. Progress banners are printed in colour between the steps.
    """
    def announce(message, colour='green'):
        # Bold coloured banner for a pipeline step.
        print(colored(message, colour, attrs=['bold']))

    spacer = '\n\n'
    dir_path = path if path else 'dataset'

    remove_incompatible_files(dir_path)
    print(spacer)

    announce('Loading files into memory')
    files = sklearn.datasets.load_files(dir_path)

    # Return value is ignored; the helper presumably edits files.data
    # in place.
    announce('Refining all files')
    util.refine_all_emails(files.data)

    announce('Calculating BOW')
    word_counts = util.bagOfWords(files.data)

    announce('Calculating TFIDF')
    tfidf = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True)
    X = tfidf.fit(word_counts).transform(word_counts)
    print(spacer)

    # Classifier under test: 11 neighbours, votes weighted by distance.
    # (MultinomialNB and LinearSVC were the alternatives tried here.)
    clf = sklearn.neighbors.KNeighborsClassifier(11, weights='distance')

    print(spacer)
    announce('Testing classifier with train-test split', 'magenta')
    test_classifier(X, files.target, clf, test_size=0.2,
                    y_names=files.target_names, confusion=False)
Example #6
0
def KNN(path):
    """Compare BOW, TF and TF-IDF features with a K Nearest Neighbors model.

    The dataset under ``path`` is preprocessed, loaded and cleaned. Three
    feature matrices are derived from it, each is passed to
    ``split_test_classifier`` with the same distance-weighted
    ``KNeighborsClassifier``, and the three result series are plotted.

    Args:
        path: Root directory of the dataset; passed to the preprocessing
            helpers and to ``sklearn.datasets.load_files``.
    """
    # A parenthesised single-argument print produces the same output as a
    # Python 2 statement and is valid as a function call on Python 3.
    print("Classifier: K Nearest Neighbors")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    # (return value is unused; presumably files.data is edited in place)
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)
    # TF: normalised term frequencies, no inverse-document-frequency weights
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)
    # TFIDF: IDF weighting must be enabled, otherwise this matrix is an
    # exact duplicate of TF and the 'TF' / 'TFIDF' curves coincide.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier: 5 neighbours, votes weighted by distance
    n_neighbors = 5
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # calculate results; `i` from the last call is the one used for plotting
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
def main_test(path=None):
    """Train-test evaluation of a distance-weighted KNN on TF-IDF features.

    Loads the files under the dataset directory, cleans them, builds a
    bag-of-words matrix, converts it to TF-IDF and runs ``test_classifier``
    with a 0.2 test size.

    Args:
        path: Dataset root directory; ``'dataset'`` is used when it is
            omitted or otherwise falsy.
    """
    dir_path = path or 'dataset'

    remove_incompatible_files(dir_path)

    # A parenthesised single-argument print produces the same output as a
    # Python 2 statement and is valid as a function call on Python 3.
    print('\n\n')

    # load data
    print(colored('Loading files into memory', 'green', attrs=['bold']))
    files = sklearn.datasets.load_files(dir_path)

    # refine all emails
    # (return value is unused; presumably files.data is edited in place)
    print(colored('Refining all files', 'green', attrs=['bold']))
    util.refine_all_emails(files.data)

    # calculate the BOW representation
    print(colored('Calculating BOW', 'green', attrs=['bold']))
    word_counts = util.bagOfWords(files.data)

    # TFIDF
    print(colored('Calculating TFIDF', 'green', attrs=['bold']))
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(word_counts)
    X = tf_transformer.transform(word_counts)

    print('\n\n')

    # create classifier: 11 neighbours, votes weighted by distance.
    # The 'uniform' value that used to be assigned first was overwritten
    # before use, so only the effective setting is kept.
    n_neighbors = 11
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # test the classifier
    print('\n\n')
    print(colored('Testing classifier with train-test split', 'magenta',
                  attrs=['bold']))
    test_classifier(X, files.target, clf, test_size=0.2,
                    y_names=files.target_names, confusion=False)
Example #8
0
def SVM(path):
    """Compare BOW, TF and TF-IDF features with a linear SVM classifier.

    The dataset under ``path`` is preprocessed, loaded and cleaned. Three
    feature matrices are derived from it, each is passed to
    ``split_test_classifier`` with the same ``LinearSVC`` instance, and the
    three result series are plotted together.

    Args:
        path: Root directory of the dataset; passed to the preprocessing
            helpers and to ``sklearn.datasets.load_files``.
    """
    # A parenthesised single-argument print produces the same output as a
    # Python 2 statement and is valid as a function call on Python 3.
    print("Classifier: Support Vector Machine")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    # (return value is unused; presumably files.data is edited in place)
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)
    # TF: normalised term frequencies, no inverse-document-frequency weights
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)
    # TFIDF: IDF weighting must be enabled, otherwise this matrix is an
    # exact duplicate of TF and the 'TF' / 'TFIDF' curves coincide.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier
    clf = sklearn.svm.LinearSVC()

    # calculate results; `i` from the last call is the one used for plotting
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
Example #9
0
def KNN(path):
    """Compare BOW, TF and TF-IDF features with a K Nearest Neighbors model.

    The dataset under ``path`` is preprocessed, loaded and cleaned. Three
    feature matrices are derived from it, each is passed to
    ``split_test_classifier`` with the same distance-weighted
    ``KNeighborsClassifier``, and the three result series are plotted.

    Args:
        path: Root directory of the dataset; passed to the preprocessing
            helpers and to ``sklearn.datasets.load_files``.
    """
    # A parenthesised single-argument print produces the same output as a
    # Python 2 statement and is valid as a function call on Python 3.
    print("Classifier: K Nearest Neighbors")
    print("Train-Test Split")

    # preprocess
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # load data
    files = sklearn.datasets.load_files(path, shuffle=True)

    # refine emails - delete unwanted text from them
    # (return value is unused; presumably files.data is edited in place)
    util.refine_all_emails(files.data)

    # feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)
    # TF: normalised term frequencies, no inverse-document-frequency weights
    tf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit(BOW)
    TF = tf_transformer.transform(BOW)
    # TFIDF: IDF weighting must be enabled, otherwise this matrix is an
    # exact duplicate of TF and the 'TF' / 'TFIDF' curves coincide.
    tfidf_transformer = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit(BOW)
    TFIDF = tfidf_transformer.transform(BOW)

    # build classifier: 5 neighbours, votes weighted by distance
    n_neighbors = 5
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # calculate results; `i` from the last call is the one used for plotting
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])