Ejemplo n.º 1
0
def deteccionDeSpam():
    """Grid-search an RBF-kernel SVM for spam detection and print its test score.

    The data sets come from ``getSets``, fed with the e-mail paths found in
    ``./spam``, ``./hard_ham`` and ``./easy_ham`` and with the vocabulary
    dictionary. One ``SVC`` is trained for every (C, sigma) pair of the grid;
    the model with the highest score on the validation set is kept and its
    score on the test set is printed.
    """
    words_dictionary = getVocabDict()
    spamMailsPath = getEmailsPathFromDirectory('./spam')
    hardHamMailsPath = getEmailsPathFromDirectory('./hard_ham')
    easyHamMailsPath = getEmailsPathFromDirectory('./easy_ham')
    X, y, Xval, yval, Xtest, ytest = getSets(spamMailsPath, hardHamMailsPath,
                                             easyHamMailsPath,
                                             words_dictionary)

    possible_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30]
    # Start below any reachable score so that a model is always selected,
    # even when every candidate scores 0 on the validation set.
    max_score = float('-inf')
    best_svm = None

    for C_value in possible_values:
        for sigma in possible_values:
            # scikit-learn parametrises the RBF kernel with gamma, not sigma.
            svm = SVC(kernel='rbf', C=C_value, gamma=(1 / (2 * sigma**2)))
            svm.fit(X, y.ravel())
            # Validation score: the higher it is, the better the model.
            # Labels are flattened exactly like the training labels above.
            current_score = svm.score(Xval, yval.ravel())
            if current_score > max_score:
                max_score = current_score
                best_svm = svm

    print("Score de clasificar los ejemplos de prueba: " +
          str(best_svm.score(Xtest, ytest.ravel())))
Ejemplo n.º 2
0
def read_file(path):
    """Build a binary word-presence matrix from the ``.txt`` files under *path*.

    The directory tree rooted at *path* is walked recursively and every file
    whose name contains ``.txt`` is tokenised with
    ``process_email.email2TokenList``. For each token that is a key of the
    vocabulary dictionary, the column at that key's position in the
    dictionary's iteration order is set to 1.

    Args:
        path: Root directory to search for e-mail files.

    Returns:
        Array of shape ``(number_of_files, vocabulary_size)`` holding zeros
        and ones, with one row per file in discovery order.
    """
    print("Leyendo archivos de ", path)

    files = []
    for root, _dirs, names in os.walk(path):
        for name in names:
            if '.txt' in name:
                files.append(os.path.join(root, name))

    vocab_dict = vocab.getVocabDict()
    # Column of a word = its position in the dictionary's iteration order.
    # A dict lookup replaces a full scan of the vocabulary for every token.
    column_of = {word: column for column, word in enumerate(vocab_dict)}

    X = np.zeros((len(files), len(column_of)))

    for row, file_path in enumerate(files):
        # The context manager closes the file as soon as it has been read.
        with codecs.open(file_path, "r", encoding="utf-8",
                         errors="ignore") as email_file:
            email_contents = email_file.read()

        for token in process_email.email2TokenList(email_contents):
            column = column_of.get(token)
            if column is not None:
                X[row, column] = 1

    print("Archivos de ", path, "leídos y guardados en X.")
    return X
Ejemplo n.º 3
0
def dataManager(ini, fin, directoryName, yValue):
    """Vectorise the e-mails numbered ``ini + 1`` to ``fin`` of a directory.

    Args:
        ini: Number of the e-mail just before the first one to read.
        fin: Number of the last e-mail to read (inclusive).
        directoryName: Directory containing files named ``NNNN.txt``
            (four-digit, zero-padded number).
        yValue: Label stacked into ``Y`` once per e-mail read.

    Returns:
        Tuple ``(X, Y)``. ``X`` stacks what ``emailToWordOcurrence`` returns
        for each e-mail under an empty ``(0, 1899)`` array, and ``Y`` stacks
        ``yValue`` once per e-mail under an empty ``(0, 1)`` array. Both have
        zero rows when the range is empty.
    """
    # The vocabulary is the same for every e-mail, so it is loaded once.
    wordsDict = getVocabDict()

    rows = []
    for i in range(ini + 1, fin + 1):
        file_name = '{0}/{1:04d}.txt'.format(directoryName, i)
        # The context manager closes the file as soon as it has been read.
        with codecs.open(file_name, 'r', encoding='utf-8',
                         errors='ignore') as email_file:
            email_contents = email_file.read()
        email = email2TokenList(email_contents)
        rows.append(emailToWordOcurrence(email, wordsDict))

    # Stack once at the end instead of growing the arrays on every iteration.
    # The empty seed arrays keep the column count and the float dtype.
    X = np.vstack([np.empty((0, 1899))] + rows)
    Y = np.vstack([np.empty((0, 1))] + [yValue] * len(rows))

    return X, Y
Ejemplo n.º 4
0
def main():
    """Grid-search an RBF-kernel SVM spam classifier and print its test error.

    The feature matrices are filled by ``addX`` from the ``spam``,
    ``easy_ham`` and ``hard_ham`` folders, and the label vectors are built by
    ``addY`` with 70 % / 20 % / 10 % of the e-mail counts for the training,
    validation and test sets. Every (C, sigma) pair of the grid is trained on
    the training set; the pair with the lowest validation error is retrained
    and evaluated on the test set, printing its predictions and error rate.
    """
    np.set_printoptions(threshold=sys.maxsize)
    dicVoc = getVocabDict()
    num_spam = 500
    num_easy_ham = 2501
    num_hard_ham = 250
    X = np.empty((0, 1900))
    Xval = np.empty((0, 1900))
    Xtest = np.empty((0, 1900))
    X, Xval, Xtest = addX("spam", X, Xval, Xtest, num_spam, dicVoc)
    X, Xval, Xtest = addX("easy_ham", X, Xval, Xtest, num_easy_ham, dicVoc)
    X, Xval, Xtest = addX("hard_ham", X, Xval, Xtest, num_hard_ham, dicVoc)

    print(np.shape(X))
    num_ham = num_easy_ham + num_hard_ham
    y = addY(num_spam * 0.7, num_ham * 0.7)
    yval = addY(num_spam * 0.2, num_ham * 0.2)
    ytest = addY(num_spam * 0.1, num_ham * 0.1)

    print("X e Y añadido perfectamente")

    array = np.array([0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30])
    minimo = np.inf
    c, g = 0, 0
    yval_flat = yval.ravel()
    for C_value in array:
        print("C: ", C_value)
        for sigma in array:
            print("sigma: ", sigma)
            # scikit-learn parametrises the RBF kernel with gamma, not sigma.
            svm = SVC(kernel='rbf', C=C_value, gamma=1 / (2 * sigma**2))
            svm.fit(X, y.ravel())
            pred = svm.predict(Xval)
            # Fraction of misclassified validation examples.
            err = np.sum(pred != yval_flat) / np.size(yval_flat, 0)
            if minimo > err:
                minimo = err
                c = C_value
                g = sigma

    # Retrain with the best hyper-parameters and evaluate on the test set.
    svm = SVC(kernel='rbf', C=c, gamma=1 / (2 * g**2))
    svm.fit(X, y.ravel())
    # The error must be computed from the test predictions, not from the
    # validation predictions left over from the last grid iteration.
    pred = svm.predict(Xtest)
    print(pred)
    ytest_flat = ytest.ravel()
    err = np.sum(pred != ytest_flat) / np.size(ytest_flat, 0)
    print(err)
Ejemplo n.º 5
0
def transform_data():
    """Vectorise the e-mail folders and save them to ``email_vectors.mat``.

    Intended to be run from a console. ``process_email`` is applied to files
    ``0001.txt``-``0250.txt`` of ``hard_ham``, ``0001.txt``-``0500.txt`` of
    ``spam`` and ``0001.txt``-``2551.txt`` of ``easy_ham``. The results are
    stacked into one array per folder (1899 columns) and stored under the
    keys ``"easy"``, ``"hard"`` and ``"spam"``.
    """
    num_hard, num_spam, num_easy = 250, 500, 2551
    dicti = gvd.getVocabDict()

    hard_rows, spam_rows, easy_rows = [], [], []
    # One pass over the file numbers; each folder only contributes while the
    # number is within its size. Rows are collected in lists and stacked once,
    # rather than re-copying the growing arrays for every e-mail.
    for number in range(1, num_easy + 1):
        file_name = "{0:04d}.txt".format(number)
        if number <= num_hard:
            hard_rows.append(process_email("hard_ham/" + file_name, dicti))
        if number <= num_spam:
            spam_rows.append(process_email("spam/" + file_name, dicti))
        easy_rows.append(process_email("easy_ham/" + file_name, dicti))

    # The empty seed keeps the 1899-column shape and the float dtype.
    seed = np.empty((0, 1899))
    easy = np.vstack([seed] + easy_rows)
    hard = np.vstack([seed] + hard_rows)
    spam = np.vstack([seed] + spam_rows)

    # Row 339 comes from spam/0340.txt, which the original author reports
    # as not working, so it is dropped.
    spam = np.delete(spam, 339, 0)
    my_dict = {"easy": easy, "hard": hard, "spam": spam}
    savemat("email_vectors.mat", my_dict)
Ejemplo n.º 6
0
# Draw the decision boundary (the 0.5 level of h) together with the
# validation-set points: '+' markers for `pos`, 'o' markers for `not_pos`.
plt.contour(xx1, xx2, h, [0.5], linewidths=1, colors='b')
plt.scatter(Xval[pos, 0], Xval[pos, 1], marker='+', c='k')
# Lowercase 'y' (yellow): uppercase single-letter colour codes were
# deprecated in Matplotlib 3.1 and are no longer accepted.
plt.scatter(Xval[not_pos, 0], Xval[not_pos, 1], marker='o', c='y')
plt.title('C = ' + str(Cmin) + ', sigma = ' + str(sigmamin) +
          ", on the validation set")
plt.show()

#%%
"""
part 2 : Spam detection
"""

# Load the vocabulary and the three e-mail sets.
dictionnary = getVocabDict()
nwords = len(dictionnary)  # number of entries in the vocabulary

# For each set:
#   X* is whatever load_email_data returns for the directory and count
#      (presumably one feature row per e-mail -- confirm in load_email_data);
#   y* is a column of class labels: 1 for spam, 0 for both kinds of ham;
#   e* is a column holding a per-set code: 0 = spam, 1 = easy ham,
#      2 = hard ham.
Xspam = load_email_data('data/spam', 500)
yspam = np.ones((500, 1))
espam = np.zeros((500, 1))
Xeham = load_email_data('data/easy_ham', 2551)
yeham = np.zeros((2551, 1))
eeham = np.ones((2551, 1))
Xhham = load_email_data('data/hard_ham', 250)
yhham = np.zeros((250, 1))
ehham = np.ones((250, 1)) * 2

#%%
"""
Training a model on spam and easy ham
Ejemplo n.º 7
0
# Evaluate the classifier on every point of the 2-D grid so that its decision
# regions can be drawn as contours.
meshX1, meshX2 = np.meshgrid(meshX1, meshX2)
meshY = svm.predict(np.array([meshX1.ravel(),
                              meshX2.ravel()]).T).reshape(meshX1.shape)

# Scatter both classes (y == 0 in red, y == 1 in blue) and overlay the contours.
plt.figure(figsize=(10, 10))
plt.scatter(X[y == 0, 0], X[y == 0, 1], c='r', marker='o')
plt.scatter(X[y == 1, 0], X[y == 1, 1], c='b', marker='o')
plt.contour(meshX1, meshX2, meshY)
# Every piece containing a backslash is a raw string, so "\sigma" is not
# parsed as an (invalid) escape sequence; the resulting text is unchanged.
plt.title(r"SVM con $C=" + str(C) + r"$ y $\sigma=" + str(sigma) + "$")
plt.savefig('P1.3.png')
plt.show()

#%% Part 2

# Load the vocabulary dictionary.
dic = getVocabDict()

# Read and process the spam e-mails: one row per file, with a 1 in column
# dic[token] - 1 for every token of the e-mail that is in the vocabulary.
# The directory is listed once so that the matrix size and the iteration
# always refer to the same set of files.
spam_files = os.listdir('spam')
spam = np.zeros((len(spam_files), len(dic)))
for i, filename in enumerate(spam_files):
    # The context manager closes the file as soon as it has been read.
    with codecs.open('spam/' + filename,
                     'r',
                     encoding='utf-8',
                     errors='ignore') as email_file:
        email_contents = email_file.read()
    email_tokens = email2TokenList(email_contents)
    for token in email_tokens:
        if token in dic:
            spam[i, dic[token] - 1] = 1
# Leemos y procesamos los datos correspondientes al easy ham
easy = np.zeros((len(os.listdir('easy_ham')), len(dic)))
Ejemplo n.º 8
0
from process_email import email2TokenList
import codecs
from get_vocab_dict import getVocabDict
import numpy as np
import os
from sklearn import svm
import matplotlib.pyplot as plt


# Vocabulary loaded once at import time; convertToIndices reads it as a
# module-level global.
vocab_dict = getVocabDict()


def convertToIndices(token):
    """Encode a token list as a binary column vector over the vocabulary.

    Args:
        token: Iterable of tokens to encode.

    Returns:
        Array of shape ``(len(vocab_dict), 1)`` containing 1 at row
        ``vocab_dict[t] - 1`` for every token ``t`` that is a key of the
        module-level ``vocab_dict``, and 0 everywhere else.
    """
    encoded = np.zeros((len(vocab_dict), 1))
    for word in token:
        # Tokens missing from the vocabulary are ignored.
        if word in vocab_dict:
            encoded[vocab_dict[word] - 1] = 1
    return encoded


def read_spam():
    spam_emails = []
    directorio = "spam"
    i = 1
    for spam_email in os.listdir(directorio):
        email_contents = codecs.open(
            '{0}/{1:04d}.txt'.format(directorio, i), 'r', encoding='utf-8', errors='ignore').read()
        tokens = email2TokenList(email_contents)
        tokens = convertToIndices(tokens)
        i += 1