def deteccionDeSpam():
    words_dictionary = getVocabDict()
    spamMailsPath = getEmailsPathFromDirectory('./spam')
    hardHamMailsPath = getEmailsPathFromDirectory('./hard_ham')
    easyHamMailsPath = getEmailsPathFromDirectory('./easy_ham')
    X, y, Xval, yval, Xtest, ytest = getSets(spamMailsPath, hardHamMailsPath,
                                             easyHamMailsPath, words_dictionary)

    possible_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
    C_value, sigma = 0, 0
    max_score = 0
    best_svm = None
    for i in range(len(possible_values)):
        C_value = possible_values[i]
        for j in range(len(possible_values)):
            sigma = possible_values[j]
            svm = SVC(kernel='rbf', C=C_value, gamma=(1 / (2 * sigma**2)))
            svm.fit(X, y.ravel())
            # Score on the validation examples (the higher the score, the better the SVM)
            current_score = svm.score(Xval, yval.ravel())
            if current_score > max_score:
                max_score = current_score
                best_svm = svm

    print("Score when classifying the test examples: " +
          str(best_svm.score(Xtest, ytest.ravel())))
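# A small, hedged check (not part of the original files) of why gamma is set to
# 1 / (2 * sigma**2) above: sklearn's SVC with kernel='rbf' computes
# exp(-gamma * ||x - z||^2), so that choice of gamma reproduces the Gaussian
# kernel exp(-||x - z||^2 / (2 * sigma**2)) that the sigma values refer to.
import numpy as np


def gaussian_kernel(x, z, sigma):
    # Gaussian kernel written with sigma explicitly.
    return np.exp(-np.sum((x - z) ** 2) / (2 * sigma ** 2))


x, z, sigma = np.array([1.0, 2.0]), np.array([0.5, 1.0]), 0.3
gamma = 1 / (2 * sigma ** 2)
print(np.isclose(gaussian_kernel(x, z, sigma),
                 np.exp(-gamma * np.sum((x - z) ** 2))))  # True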
def read_file(path):
    print("Reading files from ", path)
    files = []
    # r=root, d=directories, f=files
    for r, d, f in os.walk(path):
        for file in f:
            if '.txt' in file:
                files.append(os.path.join(r, file))

    j = 0
    vocab_dict = vocab.getVocabDict()
    arrayDict = np.array(list(vocab_dict.items()))
    X = np.zeros((len(files), len(arrayDict)))
    for f in files:
        email_contents = codecs.open(f, "r", encoding="utf-8",
                                     errors="ignore").read()
        email = process_email.email2TokenList(email_contents)
        aux = np.zeros(len(arrayDict))
        for i in range(len(email)):
            # Mark each token that appears in the vocabulary
            index = np.where(arrayDict[:, 0] == email[i])
            aux[index] = 1
        X[j] = aux
        j = j + 1

    print("Files from ", path, "read and stored in X.")
    return X
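# Hedged usage sketch (not in the original file): one possible way to combine the
# matrices returned by read_file into a single labelled dataset. The directory
# names and the convention 1 = spam / 0 = ham are assumptions for illustration.
X_spam = read_file('./spam')
X_easy = read_file('./easy_ham')
X_hard = read_file('./hard_ham')

X_all = np.vstack((X_spam, X_easy, X_hard))
y_all = np.concatenate((np.ones(len(X_spam)),      # spam labelled 1
                        np.zeros(len(X_easy)),     # easy ham labelled 0
                        np.zeros(len(X_hard))))    # hard ham labelled 0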
def dataManager(ini, fin, directoryName, yValue):
    X = np.empty((0, 1899))  # 60% of 500
    Y = np.empty((0, 1))
    wordsDict = getVocabDict()
    for i in range(ini + 1, fin + 1):
        email_contents = codecs.open('{0}/{1:04d}.txt'.format(directoryName, i),
                                     'r', encoding='utf-8',
                                     errors='ignore').read()
        email = email2TokenList(email_contents)
        wordOcurrence = emailToWordOcurrence(email, wordsDict)
        X = np.vstack((X, wordOcurrence))
        Y = np.vstack((Y, yValue))
    return X, Y
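# Hedged usage sketch (not in the original file): how dataManager could be called
# to produce a 60/20/20 split of the 500 spam emails. The boundaries 300/400/500
# and the label value 1 for spam are assumptions for illustration.
X_spam_train, y_spam_train = dataManager(0, 300, 'spam', 1)    # emails 0001-0300
X_spam_val, y_spam_val = dataManager(300, 400, 'spam', 1)      # emails 0301-0400
X_spam_test, y_spam_test = dataManager(400, 500, 'spam', 1)    # emails 0401-0500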
def main():
    np.set_printoptions(threshold=sys.maxsize)
    directorio = "spam"
    dicVoc = getVocabDict()
    num_spam = 500
    num_easy_ham = 2501
    num_hard_ham = 250

    X = np.empty((0, 1900))
    Xval = np.empty((0, 1900))
    Xtest = np.empty((0, 1900))
    X, Xval, Xtest = addX("spam", X, Xval, Xtest, num_spam, dicVoc)
    X, Xval, Xtest = addX("easy_ham", X, Xval, Xtest, num_easy_ham, dicVoc)
    X, Xval, Xtest = addX("hard_ham", X, Xval, Xtest, num_hard_ham, dicVoc)
    print(np.shape(X))

    y = addY(num_spam * 0.7, (num_easy_ham + num_hard_ham) * 0.7)
    yval = addY(num_spam * 0.2, (num_easy_ham + num_hard_ham) * 0.2)
    ytest = addY(num_spam * 0.1, (num_easy_ham + num_hard_ham) * 0.1)
    print("X and Y built successfully")

    array = np.array([0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30])
    minimo = np.inf
    c, g = 0, 0
    for i in array:
        print("C: ", i)
        for j in array:
            print("sigma: ", j)
            sigma = j
            svm = SVC(kernel='rbf', C=i, gamma=1 / (2 * sigma**2))
            svm.fit(X, y.ravel())
            # Validation error for this (C, sigma) pair
            pred = svm.predict(Xval)
            err = np.sum(pred != yval.ravel()) / np.size(yval.ravel(), 0)
            if (minimo > err):
                minimo = err
                c = i
                g = j

    # Refit with the best hyperparameters and evaluate on the test set
    svm = SVC(kernel='rbf', C=c, gamma=1 / (2 * g**2))
    svm.fit(X, y.ravel())
    pred = svm.predict(Xtest)
    print(pred)
    err = np.sum(pred != ytest.ravel()) / np.size(ytest.ravel(), 0)
    print(err)
def transform_data():  # used in console to save the email's vectors in a .mat file
    easy = np.empty((0, 1899))
    hard = np.empty((0, 1899))
    spam = np.empty((0, 1899))
    dicti = gvd.getVocabDict()

    for i in np.arange(250):
        hard = np.vstack(
            (hard, process_email("hard_ham/{0:04d}.txt".format(i + 1), dicti)))
        spam = np.vstack(
            (spam, process_email("spam/{0:04d}.txt".format(i + 1), dicti)))
        easy = np.vstack(
            (easy, process_email("easy_ham/{0:04d}.txt".format(i + 1), dicti)))
    for i in np.arange(250, 500):
        spam = np.vstack(
            (spam, process_email("spam/{0:04d}.txt".format(i + 1), dicti)))
        easy = np.vstack(
            (easy, process_email("easy_ham/{0:04d}.txt".format(i + 1), dicti)))
    for i in np.arange(500, 2551):
        easy = np.vstack(
            (easy, process_email("easy_ham/{0:04d}.txt".format(i + 1), dicti)))

    spam = np.delete(spam, 339, 0)  # email spam/0340.txt doesn't work
    my_dict = {"easy": easy, "hard": hard, "spam": spam}
    savemat("email_vectors.mat", my_dict)
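# Hedged companion sketch (not in the original file): the vectors written by
# transform_data() can be reloaded with scipy.io.loadmat; the keys match the
# dictionary saved above, and the expected shapes are noted for reference.
from scipy.io import loadmat

data = loadmat("email_vectors.mat")
easy_vectors = data["easy"]   # (2551, 1899)
hard_vectors = data["hard"]   # (250, 1899)
spam_vectors = data["spam"]   # (499, 1899), spam/0340.txt was dropped
print(easy_vectors.shape, hard_vectors.shape, spam_vectors.shape)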
# Drawing the decision boundary along with the data of the validation set
plt.contour(xx1, xx2, h, [0.5], linewidths=1, colors='b')
plt.scatter(Xval[pos, 0], Xval[pos, 1], marker='+', c='k')
plt.scatter(Xval[not_pos, 0], Xval[not_pos, 1], marker='o', c='Y')
plt.title('C = ' + str(Cmin) + ', sigma = ' + str(sigmamin) +
          ", on the validation set")
plt.show()

#%%
""" Part 2: Spam detection """

# Loading the data
dictionnary = getVocabDict()
nwords = len(dictionnary)

Xspam = load_email_data('data/spam', 500)
yspam = np.ones((500, 1))
espam = np.zeros((500, 1))

Xeham = load_email_data('data/easy_ham', 2551)
yeham = np.zeros((2551, 1))
eeham = np.ones((2551, 1))

Xhham = load_email_data('data/hard_ham', 250)
yhham = np.zeros((250, 1))
ehham = np.ones((250, 1)) * 2

#%%
""" Training a model on spam and easy ham
meshX1, meshX2 = np.meshgrid(meshX1, meshX2)
meshY = svm.predict(np.array([meshX1.ravel(),
                              meshX2.ravel()]).T).reshape(meshX1.shape)

plt.figure(figsize=(10, 10))
plt.scatter(X[y == 0, 0], X[y == 0, 1], c='r', marker='o')
plt.scatter(X[y == 1, 0], X[y == 1, 1], c='b', marker='o')
plt.contour(meshX1, meshX2, meshY)
plt.title(r"SVM with $C=" + str(C) + r"$ and $\sigma=" + str(sigma) + "$")
plt.savefig('P1.3.png')
plt.show()

#%% Part 2

# Load the word dictionary
dic = getVocabDict()

# Read and process the spam data
spam = np.zeros((len(os.listdir('spam')), len(dic)))
for i, filename in enumerate(os.listdir('spam')):
    email_contents = codecs.open('spam/' + filename, 'r', encoding='utf-8',
                                 errors='ignore').read()
    email_tokens = email2TokenList(email_contents)
    for token in email_tokens:
        if token in dic.keys():
            spam[i, dic[token] - 1] = 1

# Read and process the easy ham data
easy = np.zeros((len(os.listdir('easy_ham')), len(dic)))
from process_email import email2TokenList
import codecs
from get_vocab_dict import getVocabDict
import numpy as np
import os
from sklearn import svm
import matplotlib.pyplot as plt

vocab_dict = getVocabDict()


def convertToIndices(token):
    # Build a 0/1 vector marking which vocabulary words appear in the token list
    indicesOfWords = [vocab_dict[t] for t in token if t in vocab_dict]
    result = np.zeros((len(vocab_dict), 1))
    for index in indicesOfWords:
        result[index - 1] = 1
    return result


def read_spam():
    spam_emails = []
    directorio = "spam"
    i = 1
    for spam_email in os.listdir(directorio):
        email_contents = codecs.open('{0}/{1:04d}.txt'.format(directorio, i),
                                     'r', encoding='utf-8',
                                     errors='ignore').read()
        tokens = email2TokenList(email_contents)
        tokens = convertToIndices(tokens)
        i += 1
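# Hedged check (not in the original file) of convertToIndices: tokens missing from
# vocab_dict are ignored, and each vocabulary word found sets entry index - 1 to 1.
# The example tokens are made up for illustration.
example_tokens = ['anyon', 'know', 'notavocabword']
vec = convertToIndices(example_tokens)
print(vec.shape)        # (len(vocab_dict), 1)
print(int(vec.sum()))   # number of distinct vocabulary words found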