Ejemplo n.º 1
0
def KNN_parameter(path):
    """Run the K-fold parameter test for K Nearest Neighbors and plot it.

    Preprocesses the dataset under ``path``, loads it with
    ``sklearn.datasets.load_files``, builds TF-IDF features from the
    bag-of-words matrix and hands them to ``KFOLD_KNN_parameter_test``.
    The uniform and weighted result series it returns are passed to
    ``plot_results``.

    Args:
        path: Dataset directory; forwarded to the ``main`` preprocessing
            helpers and to ``load_files``.

    Returns:
        None. Results are only printed/plotted.
    """
    # print() calls (not print statements) so the function parses on
    # Python 3 as well as Python 2.
    print("Classifier: K Nearest Neighbors")
    print("KFOLD parameter test")

    # Preprocess.
    # NOTE(review): these helpers presumably rearrange/delete files under
    # `path` on disk -- confirm before running on data you need to keep.
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # Load data (documents are shuffled on load).
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Refine emails - delete unwanted text from them. The return value is
    # discarded, so this presumably edits files.data in place -- TODO confirm.
    util.refine_all_emails(files.data)

    # Feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)
    # TFIDF: use_idf=True enables the inverse-document-frequency
    # reweighting; with use_idf=False the result is only normalized term
    # frequency, not TF-IDF.
    TFIDF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit_transform(BOW)

    # k in kfold
    n_cross_val = 5

    # Calculate results. `i` is passed straight through to plot_results
    # (presumably the x-axis values -- verify against the helper).
    i, uniform_results, weighted_results = KFOLD_KNN_parameter_test(
        TFIDF, files.target, n_cross_val=n_cross_val, n_neighbors=5)

    # Plot
    plot_results(i, [uniform_results, weighted_results],
                 ['uniform', 'weighted'])
Ejemplo n.º 2
0
def NB(path):
	"""Compare BOW, TF and TF-IDF features with a Multinomial Naive Bayes classifier.

	Preprocesses the dataset under ``path``, loads it with
	``sklearn.datasets.load_files``, derives three feature matrices from the
	documents and runs ``split_test_classifier`` on each of them with the
	same ``MultinomialNB`` instance. The three result series are passed to
	``plot_results``.

	Args:
		path: Dataset directory; forwarded to the ``main`` preprocessing
			helpers and to ``load_files``.

	Returns:
		None. Results are only printed/plotted.
	"""
	print("Classifier: Naive Bayes")
	print("Train-Test Split")

	# Preprocess.
	# NOTE(review): these helpers presumably rearrange/delete files under
	# `path` on disk -- confirm before running on data you need to keep.
	main.reorganize_dataset(path)
	main.remove_incompatible_files(path)

	# Load data (documents are shuffled on load).
	files = sklearn.datasets.load_files(path, shuffle=True)

	# Refine emails - delete unwanted text from them. The return value is
	# discarded, so this presumably edits files.data in place -- TODO confirm.
	util.refine_all_emails(files.data)

	# Feature extraction
	# BOW
	BOW = util.bagOfWords(files.data)
	# TF: term frequencies only (IDF reweighting disabled).
	TF = sklearn.feature_extraction.text.TfidfTransformer(use_idf=False).fit_transform(BOW)
	# TFIDF: use_idf=True is what distinguishes this from TF above; with
	# use_idf=False both matrices (and both plotted curves) are identical.
	TFIDF = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True).fit_transform(BOW)

	# Build classifier
	clf = sklearn.naive_bayes.MultinomialNB()

	# Calculate results. `i` is overwritten by each call and the last value
	# is passed to plot_results (presumably the same x-axis values every
	# time -- verify against split_test_classifier).
	i, BOW_results = split_test_classifier(clf, BOW, files.target)
	i, TF_results = split_test_classifier(clf, TF, files.target)
	i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

	# Plot
	plot_results(i, [BOW_results, TF_results, TFIDF_results], ['BOW', 'TF', 'TFIDF'])
Ejemplo n.º 3
0
def KNN_parameter(path):
	"""Run the K-fold parameter test for K Nearest Neighbors and plot it.

	Preprocesses the dataset under ``path``, loads it with
	``sklearn.datasets.load_files``, builds TF-IDF features from the
	bag-of-words matrix and hands them to ``KFOLD_KNN_parameter_test``.
	The uniform and weighted result series it returns are passed to
	``plot_results``.

	Args:
		path: Dataset directory; forwarded to the ``main`` preprocessing
			helpers and to ``load_files``.

	Returns:
		None. Results are only printed/plotted.
	"""
	print("Classifier: K Nearest Neighbors")
	print("KFOLD parameter test")

	# Preprocess.
	# NOTE(review): these helpers presumably rearrange/delete files under
	# `path` on disk -- confirm before running on data you need to keep.
	main.reorganize_dataset(path)
	main.remove_incompatible_files(path)

	# Load data (documents are shuffled on load).
	files = sklearn.datasets.load_files(path, shuffle=True)

	# Refine emails - delete unwanted text from them. The return value is
	# discarded, so this presumably edits files.data in place -- TODO confirm.
	util.refine_all_emails(files.data)

	# Feature extraction
	# BOW
	BOW = util.bagOfWords(files.data)
	# TFIDF: use_idf=True enables the inverse-document-frequency
	# reweighting; with use_idf=False the result is only normalized term
	# frequency, not TF-IDF.
	TFIDF = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True).fit_transform(BOW)

	# k in kfold
	n_cross_val = 5

	# Calculate results. `i` is passed straight through to plot_results
	# (presumably the x-axis values -- verify against the helper).
	i, uniform_results, weighted_results = KFOLD_KNN_parameter_test(
		TFIDF, files.target, n_cross_val=n_cross_val, n_neighbors=5)

	# Plot
	plot_results(i, [uniform_results, weighted_results], ['uniform', 'weighted'])
Ejemplo n.º 4
0
def KNN(path):
    """Compare BOW, TF and TF-IDF features with a K Nearest Neighbors classifier.

    Preprocesses the dataset under ``path``, loads it with
    ``sklearn.datasets.load_files``, derives three feature matrices from the
    documents and runs ``split_test_classifier`` on each of them with the
    same ``KNeighborsClassifier`` instance (5 neighbors, distance
    weighting). The three result series are passed to ``plot_results``.

    Args:
        path: Dataset directory; forwarded to the ``main`` preprocessing
            helpers and to ``load_files``.

    Returns:
        None. Results are only printed/plotted.
    """
    # print() calls (not print statements) so the function parses on
    # Python 3 as well as Python 2.
    print("Classifier: K Nearest Neighbors")
    print("Train-Test Split")

    # Preprocess.
    # NOTE(review): these helpers presumably rearrange/delete files under
    # `path` on disk -- confirm before running on data you need to keep.
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # Load data (documents are shuffled on load).
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Refine emails - delete unwanted text from them. The return value is
    # discarded, so this presumably edits files.data in place -- TODO confirm.
    util.refine_all_emails(files.data)

    # Feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)
    # TF: term frequencies only (IDF reweighting disabled).
    TF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit_transform(BOW)
    # TFIDF: use_idf=True is what distinguishes this from TF above; with
    # use_idf=False both matrices (and both plotted curves) are identical.
    TFIDF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit_transform(BOW)

    # Build classifier. 'distance' weights neighbors by inverse distance;
    # the alternative is 'uniform' (all neighbors weighted equally).
    n_neighbors = 5
    weights = 'distance'
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

    # Calculate results. `i` is overwritten by each call and the last value
    # is passed to plot_results (presumably the same x-axis values every
    # time -- verify against split_test_classifier).
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # Plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
Ejemplo n.º 5
0
def SVM(path):
    """Compare BOW, TF and TF-IDF features with a linear Support Vector Machine.

    Preprocesses the dataset under ``path``, loads it with
    ``sklearn.datasets.load_files``, derives three feature matrices from the
    documents and runs ``split_test_classifier`` on each of them with the
    same ``LinearSVC`` instance. The three result series are passed to
    ``plot_results``.

    Args:
        path: Dataset directory; forwarded to the ``main`` preprocessing
            helpers and to ``load_files``.

    Returns:
        None. Results are only printed/plotted.
    """
    # print() calls (not print statements) so the function parses on
    # Python 3 as well as Python 2.
    print("Classifier: Support Vector Machine")
    print("Train-Test Split")

    # Preprocess.
    # NOTE(review): these helpers presumably rearrange/delete files under
    # `path` on disk -- confirm before running on data you need to keep.
    main.reorganize_dataset(path)
    main.remove_incompatible_files(path)

    # Load data (documents are shuffled on load).
    files = sklearn.datasets.load_files(path, shuffle=True)

    # Refine emails - delete unwanted text from them. The return value is
    # discarded, so this presumably edits files.data in place -- TODO confirm.
    util.refine_all_emails(files.data)

    # Feature extraction
    # BOW
    BOW = util.bagOfWords(files.data)
    # TF: term frequencies only (IDF reweighting disabled).
    TF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=False).fit_transform(BOW)
    # TFIDF: use_idf=True is what distinguishes this from TF above; with
    # use_idf=False both matrices (and both plotted curves) are identical.
    TFIDF = sklearn.feature_extraction.text.TfidfTransformer(
        use_idf=True).fit_transform(BOW)

    # Build classifier
    clf = sklearn.svm.LinearSVC()

    # Calculate results. `i` is overwritten by each call and the last value
    # is passed to plot_results (presumably the same x-axis values every
    # time -- verify against split_test_classifier).
    i, BOW_results = split_test_classifier(clf, BOW, files.target)
    i, TF_results = split_test_classifier(clf, TF, files.target)
    i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

    # Plot
    plot_results(i, [BOW_results, TF_results, TFIDF_results],
                 ['BOW', 'TF', 'TFIDF'])
Ejemplo n.º 6
0
def KNN(path):
	"""Compare BOW, TF and TF-IDF features with a K Nearest Neighbors classifier.

	Preprocesses the dataset under ``path``, loads it with
	``sklearn.datasets.load_files``, derives three feature matrices from the
	documents and runs ``split_test_classifier`` on each of them with the
	same ``KNeighborsClassifier`` instance (5 neighbors, distance
	weighting). The three result series are passed to ``plot_results``.

	Args:
		path: Dataset directory; forwarded to the ``main`` preprocessing
			helpers and to ``load_files``.

	Returns:
		None. Results are only printed/plotted.
	"""
	# print() calls (not print statements) so the function parses on
	# Python 3 as well as Python 2.
	print("Classifier: K Nearest Neighbors")
	print("Train-Test Split")

	# Preprocess.
	# NOTE(review): these helpers presumably rearrange/delete files under
	# `path` on disk -- confirm before running on data you need to keep.
	main.reorganize_dataset(path)
	main.remove_incompatible_files(path)

	# Load data (documents are shuffled on load).
	files = sklearn.datasets.load_files(path, shuffle=True)

	# Refine emails - delete unwanted text from them. The return value is
	# discarded, so this presumably edits files.data in place -- TODO confirm.
	util.refine_all_emails(files.data)

	# Feature extraction
	# BOW
	BOW = util.bagOfWords(files.data)
	# TF: term frequencies only (IDF reweighting disabled).
	TF = sklearn.feature_extraction.text.TfidfTransformer(use_idf=False).fit_transform(BOW)
	# TFIDF: use_idf=True is what distinguishes this from TF above; with
	# use_idf=False both matrices (and both plotted curves) are identical.
	TFIDF = sklearn.feature_extraction.text.TfidfTransformer(use_idf=True).fit_transform(BOW)

	# Build classifier. 'distance' weights neighbors by inverse distance;
	# the alternative is 'uniform' (all neighbors weighted equally).
	n_neighbors = 5
	weights = 'distance'
	clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors, weights=weights)

	# Calculate results. `i` is overwritten by each call and the last value
	# is passed to plot_results (presumably the same x-axis values every
	# time -- verify against split_test_classifier).
	i, BOW_results = split_test_classifier(clf, BOW, files.target)
	i, TF_results = split_test_classifier(clf, TF, files.target)
	i, TFIDF_results = split_test_classifier(clf, TFIDF, files.target)

	# Plot
	plot_results(i, [BOW_results, TF_results, TFIDF_results], ['BOW', 'TF', 'TFIDF'])