# Example #1
# 0
def processEmail(email_contents):
    """Preprocess the body of an email and return the vocabulary indices
    of the words it contains.

    Parameters
    ----------
    email_contents : str
        Raw email body text.

    Returns
    -------
    list[int]
        Indices into the vocabulary list, one entry per recognised word
        occurrence (a word appearing several times yields several indices).
    """
    from nltk.stem import PorterStemmer

    vocabList = getVocabList()
    word_indices = []

    # Normalise the email text: lower-case, strip HTML tags, and replace
    # numbers / URLs / email addresses / dollar signs with canonical tokens.
    email_contents = email_contents.lower()
    email_contents, _ = re.subn(r'<[^<>]+>', ' ', email_contents)
    email_contents, _ = re.subn(r'[0-9]+', 'number', email_contents)
    email_contents, _ = re.subn(r'(http|https)://[^\s]*', 'httpaddr',
                                email_contents)
    email_contents, _ = re.subn(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents, _ = re.subn(r'[$]+', 'dollar', email_contents)

    if email_contents != '':
        # Keep alphabetic tokens only, then reduce each word to its stem
        # (e.g. "including" -> "includ") so it can match the stemmed
        # vocabulary entries.
        # Fix: the stemmer used to be constructed once per word inside the
        # loop; it is stateless across tokens, so build it once.
        stemmer = PorterStemmer()
        for word in re.findall(r'[A-Za-z]+', email_contents):
            word = stemmer.stem(word)
            # Record every vocabulary slot whose entry equals this word
            # (no break: preserves the original multi-match behaviour).
            for i, vocab_word in enumerate(vocabList):
                if vocab_word == word:
                    word_indices.append(i)
    return word_indices
# NOTE(review): this redefines processEmail from just above; at import
# time only this later definition survives.
def processEmail(email_contents):
    """Normalise an email body (numbers, URLs, addresses, dollar signs)
    and attempt to tokenize it.

    NOTE(review): this version only prints intermediate results and has
    no return statement -- it looks unfinished; confirm before use.
    """
    vocab_list = getVocabList()
    print(len(vocab_list))

    # process input
    email_contents = email_contents.lower()
    # email_contents = email_contents.replace("<[^<>]+>', ' ", "")
    # email_contents = re.sub("[+>', ']", "", email_contents)

    # handle numbers
    email_contents = re.sub("[0-9]+", "number", email_contents)

    # handle URL
    email_contents = re.sub("(http|https)://[^\s]*", "httpaddr",
                            email_contents)

    # replace email addresses and dollar signs with canonical tokens
    email_contents = re.sub("[^\s]+@[^\s]+", "emailaddr", email_contents)
    email_contents = re.sub("[$]+", "dollar", email_contents)

    # token email
    # remove punctuations
    # @$ /  # .-:&*+=[]?!(){},''
    # NOTE(review): this pattern matches the literal character SEQUENCE
    # "@$ /  # .-:&*+=[]?!(){}," (with '$' acting as an end anchor), not
    # the individual punctuation characters; a character class such as
    # r'[@$/#.\-:&*+=\[\]?!(){},]' was almost certainly intended -- confirm.
    # The adjacent '' literal merely concatenates an empty string (no-op).
    l = re.compile(r'@$ /  # .-:&*\+=\[]?!(){},' '').split(email_contents)
    print(l)
    print(len(l))
    # NOTE(review): same broken literal-sequence pattern as above -- confirm.
    email_contents = re.sub(r' @$ /  # .-:&*\+=\[]?!(){},'
                            '', " ", email_contents)

    print(email_contents)
# Example #3
# 0
def ProcessEmail(email):
    """Normalise an email body and map its words to vocabulary indices.

    Returns a (word_index, vocab_dict) pair: the list of integer indices
    of the recognised tokens, and the vocabulary dictionary used.
    """
    # --- Normalisation: each re.sub canonicalises one class of token ---
    email = email.lower()
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)  # URLs
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email)         # e-mail addresses
    email = re.sub('[<>?,.:/]+', ' ', email)                    # listed punctuation
    email = re.sub('[0-9]+', 'number', email)                   # digit runs
    email = re.sub('[$]+', 'dollar ', email)                    # dollar signs
    email = re.sub('[\s]+', ' ', email)                         # collapse whitespace

    print("Processed e-mail :\n\n", email)

    ######################
    # --- Tokenise, stem, and look each stem up in the vocabulary ---
    stemmer = nltk.PorterStemmer()
    vocab_dict = getVocabList()  # maps word -> index (stored as text)

    word_index = []
    for raw_token in email.split():
        stem = stemmer.stem(raw_token)
        if stem in vocab_dict:
            # int() matters: the vocabulary stores indices as strings
            # (e.g. '86'), and we want numeric indices in the result.
            word_index.append(int(vocab_dict[stem]))
    return word_index, vocab_dict
# Evaluate the trained classifier on the held-out test set.
p = svmPredict(model, Xtest)

print('Test Accuracy: %f\n'%(np.mean(np.double(p == ytest)) * 100))
input('Program paused. Press enter to continue.\n')

## ================= Part 5: Top Predictors of Spam ====================
#  Since the model we are training is a linear SVM, we can inspect the
#  weights learned by the model to understand better how it is determining
#  whether an email is spam or not. The following code finds the words with
#  the highest weights in the classifier. Informally, the classifier
#  'thinks' that these words are the most likely indicators of spam.
#

# Sort the weights and obtain the vocabulary list
# (negating the weights makes argsort yield descending order)
idx = np.argsort(-model['w'], axis=0)
vocabList = getVocabList()

print('\nTop predictors of spam: \n')
# idx[i][0] is the row index of the i-th largest weight.
for i in range(15):
    print(' %-15s (%f) \n'%(vocabList[idx[i][0]], model['w'][idx[i][0]]))

print('\n\n')
print('\nProgram paused. Press enter to continue.\n')
input('Program paused. Press enter to continue.\n')

## =================== Part 6: Try Your Own Emails =====================
#  Now that you've trained the spam classifier, you can use it on your own
#  emails! In the starter code, we have included spamSample1.txt,
#  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples. 
#  The following code reads in one of these emails and then uses your 
#  learned SVM classifier to determine whether the email is Spam or 
def processEmail(email_contents):
    """Preprocess the body of an email and return the vocabulary indices
    of the words it contains.

    The email is lower-cased; HTML tags are stripped; numbers, URLs,
    email addresses and dollar signs are replaced by the canonical tokens
    'number', 'httpaddr', 'emailaddr' and 'dollar'.  Tokens are then
    Porter-stemmed and looked up in the vocabulary.

    Parameters
    ----------
    email_contents : str
        Raw email body.

    Returns
    -------
    list[int]
        Position in getVocabList() of each recognised token (first
        matching position per token, as before).
    """

    # Load Vocabulary
    vocabList = getVocabList()

    # Build a word -> first-index map once so each token lookup below is
    # O(1) instead of a linear scan over the whole vocabulary.
    vocab_index = {}
    for i, vocab_word in enumerate(vocabList):
        vocab_index.setdefault(vocab_word, i)

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # (If working with raw emails with full headers, strip the headers
    # before this point.)

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and
    # does not have any < or > in the tag; replace it with a space
    email_contents = re.sub(r"<[^<>]+>", " ", email_contents)

    # Handle Numbers: one or more characters between 0-9
    email_contents = re.sub(r"[0-9]+", "number", email_contents)

    # Handle URLS: strings starting with http:// or https://
    email_contents = re.sub(r"(http|https)://[^\s]*", "httpaddr",
                            email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = re.sub(r"[^\s]+@[^\s]+", "emailaddr", email_contents)

    # Handle $ sign
    email_contents = re.sub(r"[$]+", "dollar", email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')

    # Running length of the current echoed output line (wrapped at 78 cols)
    l = 0

    strs = re.split(r'[ `\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?\n\t]',
                    email_contents)

    p = PorterStemmer()
    for _str in strs:

        # Remove any non alphanumeric characters
        _str = re.sub(r"[^a-zA-Z0-9]", "", _str)

        # Stem the word
        _str = p.stem(_str)

        # Skip the word if it is too short
        if len(_str) < 1:
            continue

        # O(1) lookup; same result as scanning vocabList for the first match
        if _str in vocab_index:
            word_indices.append(vocab_index[_str])

        # Wrap the echoed output at 78 columns
        if l + len(_str) + 1 > 78:
            print('')
            l = 0

        print(_str + " ", end='')
        # Fix: 'l' was never advanced before, so the wrap above could
        # never trigger; track the echoed length as the sibling
        # implementations in this file do.
        l = l + len(_str) + 1

    # Print footer
    print('\n\n=========================\n')
    return word_indices
## ================= Part 5: Top Predictors of Spam ====================
#  A linear SVM's weight vector can be inspected directly: the vocabulary
#  words whose weights are largest are the ones the classifier treats as
#  the strongest indicators of spam.

weights = model.coef_[0]

# Positions of the 15 largest weights, in descending order
# (argsort is ascending, so reverse and take the head).
top15 = weights.argsort()[::-1][:15]
vocabList = sorted(gvl.getVocabList().keys())

print('\nTop predictors of spam: \n')
for rank_pos in top15:
    print(' {:s} ({:f}) '.format(vocabList[rank_pos], float(weights[rank_pos])))

raw_input('Program paused. Press enter to continue.')

## =================== Part 6: Try Your Own Emails =====================
#  With the classifier trained, it can be used on arbitrary emails.
#  The starter code includes spamSample1.txt, spamSample2.txt,
#  emailSample1.txt and emailSample2.txt as examples; the following code
#  reads one of them and uses the learned SVM classifier to decide
#  whether the email is Spam or
# Example #7
# 0
## ================= Part 5: Top Predictors of Spam ====================
#  Because the trained model is a linear SVM, each learned coefficient
#  tells us how strongly the corresponding vocabulary word pushes an
#  email towards the "spam" label; the largest ones are the strongest
#  spam indicators.

coef = model.coef_[0]

# Take the 15 positions with the largest coefficients, descending.
best = coef.argsort()[::-1][:15]
vocabList = sorted(getVocabList().keys())

print('\nTop predictors of spam: \n')
for pos in best:
    print(' {:s} ({:f}) '.format(vocabList[pos], float(coef[pos])))

input('Program paused. Press <Enter> to continue...')

## =================== Part 6: Try Your Own Emails =====================
#  With the classifier trained, it can be applied to your own emails.
#  The starter code ships spamSample1.txt, spamSample2.txt,
#  emailSample1.txt and emailSample2.txt as examples; the code that
#  follows reads one of these files and uses the learned SVM classifier
#  to determine whether that email is Spam or
#  Not Spam
def processEmail(email_contents):
    """Preprocess the body of an email and return a list of word indices.

    The email is lower-cased; HTML tags and newlines are stripped; numbers,
    URLs, email addresses and dollar signs become the canonical tokens
    'number', 'httpaddr', 'emailaddr' and 'dollar'.  Remaining tokens are
    stemmed and looked up in the vocabulary.

    Parameters
    ----------
    email_contents : str
        Raw email body.

    Returns
    -------
    list[int]
        Position in getVocabList() of each recognised token.
    """
    # Load Vocabulary
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML (any <...> tag without nested angle brackets) and
    # newlines, replacing them with a space
    rx = re.compile('<[^<>]+>|\n')
    email_contents = rx.sub(' ', email_contents)

    # Handle Numbers: one or more digits -> 'number'
    rx = re.compile('[0-9]+')
    email_contents = rx.sub('number', email_contents)

    # Handle URLS: strings starting with http:// or https:// -> 'httpaddr'
    rx = re.compile('(http|https)://[^\s]*')
    email_contents = rx.sub('httpaddr', email_contents)

    # Handle Email Addresses: strings with @ in the middle -> 'emailaddr'
    rx = re.compile('[^\s]+@[^\s]+')
    email_contents = rx.sub('emailaddr', email_contents)

    # Handle $ sign
    rx = re.compile('[$]+')
    email_contents = rx.sub('dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n=== Processed Email ====')

    # Running length of the current echoed output line (wrapped at 78 cols)
    l = 0

    # Remove any non alphanumeric characters, then split on whitespace
    rx = re.compile('[^a-zA-Z0-9 ]')
    email_contents = rx.sub('', email_contents).split()

    print(email_contents)

    for word in email_contents:
        # Stem the word
        # (the porterStemmer sometimes has issues, so guard the call;
        # narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # are no longer swallowed)
        try:
            word = porterStemmer(word.strip())
        except Exception:
            continue

        # Skip the word if it is too short
        if len(word) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if
        # found
        if word in vocabList:
            word_indices.append(vocabList.index(word))

        # Print to screen, ensuring that the output lines are not too long.
        # Fix: the wrap test previously read 'l + len(word) + l', which
        # double-counted the running length; '+ 1' accounts for the single
        # separating space, matching the increment below (and the sibling
        # implementations in this file).
        if l + len(word) + 1 > 78:
            print(word)
            l = 0
        else:
            print(word, end=' ')
            l = l + len(word) + 1
    print('\n=========================\n')

    return word_indices
# Example #9
# 0
def processEmail(email_contents):
    """Preprocess the body of an email and return a list of word_indices.

    word_indices = PROCESSEMAIL(email_contents) preprocesses
    the body of an email and returns a list of indices of the
    words contained in the email.

    NOTE(review): Python 2 code (print statements). The "YOUR CODE HERE"
    lookup section below is unfilled, so as written this always returns
    an empty list -- confirm whether that is intended.
    """

# Load Vocabulary
    vocabList = getVocabList()

# Init return value
    word_indices = []

# ========================== Preprocess Email ===========================

# Find the Headers ( \n\n and remove )
# Uncomment the following lines if you are working with raw emails with the
# full headers

# hdrstart = strfind(email_contents, ([chr(10) chr(10)]))
# email_contents = email_contents(hdrstart(1):end)

# Lower case
    # NOTE(review): 'lower' is not a Python builtin; unless a star-import
    # provides it, this should likely be email_contents.lower() -- confirm.
    email_contents = lower(email_contents)

# Strip all HTML
# Looks for any expression that starts with < and ends with > and replace
# and does not have any < or > in the tag it with a space
    rx = re.compile('<[^<>]+>|\n')
    email_contents = rx.sub(' ', email_contents)
# Handle Numbers
# Look for one or more characters between 0-9
    rx = re.compile('[0-9]+')
    email_contents = rx.sub('number ', email_contents)

# Handle URLS
# Look for strings starting with http:// or https://
    rx = re.compile('(http|https)://[^\s]*')
    email_contents = rx.sub('httpaddr ', email_contents)

# Handle Email Addresses
# Look for strings with @ in the middle
    rx = re.compile('[^\s]+@[^\s]+')
    email_contents = rx.sub('emailaddr ', email_contents)

# Handle $ sign
    rx = re.compile('[$]+')
    email_contents = rx.sub('dollar ', email_contents)

# ========================== Tokenize Email ===========================

# Output the email to screen as well
    print '==== Processed Email ====\n'

# Process file: running length of the echoed line (wrapped at 78 columns)
    l = 0

# Remove any non alphanumeric characters
    rx = re.compile('[^a-zA-Z0-9 ]')
    email_contents = rx.sub('', email_contents).split()

    # NOTE(review): 'str' shadows the builtin of the same name for the
    # remainder of this function.
    for str in email_contents:

        # Tokenize and also get rid of any punctuation
        # str = re.split('[' + re.escape(' @$/#.-:&*+=[]?!(){},''">_<#')
        #                                + chr(10) + chr(13) + ']', str)

        # Stem the word
        # (the porterStemmer sometimes has issues, so we use a try catch block)
        try:
            str = porterStemmer(str.strip())
        except:
            str = ''
            continue

        # Skip the word if it is too short
        if len(str) < 1:
           continue

        # Look up the word in the dictionary and add to word_indices if
        # found
        # ====================== YOUR CODE HERE ======================
        # Instructions: Fill in this function to add the index of str to
        #               word_indices if it is in the vocabulary. At this point
        #               of the code, you have a stemmed word from the email in
        #               the variable str. You should look up str in the
        #               vocabulary list (vocabList). If a match exists, you
        #               should add the index of the word to the word_indices
        #               vector. Concretely, if str = 'action', then you should
        #               look up the vocabulary list to find where in vocabList
        #               'action' appears. For example, if vocabList{18} =
        #               'action', then, you should add 18 to the word_indices
        #               vector (e.g., word_indices = [word_indices  18] ).
        #
        # Note: vocabList{idx} returns a the word with index idx in the
        #       vocabulary list.
        #
        # Note: You can use strcmp(str1, str2) to compare two strings (str1 and
        #       str2). It will return 1 only if the two strings are equivalent.
        #




        # =============================================================

        # Print to screen, ensuring that the output lines are not too long
        if (l + len(str) + 1) > 78:
            print str
            l = 0
        else:
            print str,
            l = l + len(str) + 1

# Print footer
    print '\n========================='
    return word_indices
# Example #10
# 0
def ex6_spam():
    """Exercise 6 driver: spam classification with SVMs.

    Preprocesses a sample email, extracts its feature vector, trains a
    linear SVM on spamTrain.mat, evaluates it on spamTest.mat, prints the
    15 words with the largest learned weights, and classifies a sample
    email file.

    NOTE(review): relies on project helpers defined elsewhere (readFile,
    processEmail, emailFeatures, svmTrain, svmPredict, linearKernel,
    getVocabList, formatter) and on module-level np / scipy.io imports.
    """
    ## Machine Learning Online Class
    #  Exercise 6 | Spam Classification with SVMs
    #
    #  Instructions
    #  ------------
    #
    #  This file contains code that helps you get started on the
    #  exercise. You will need to complete the following functions:
    #
    #     gaussianKernel.m
    #     dataset3Params.m
    #     processEmail.m
    #     emailFeatures.m
    #
    #  For this exercise, you will not need to change any code in this file,
    #  or any other files other than those mentioned above.
    #

    ## Initialization
    #clear ; close all; clc

    ## ==================== Part 1: Email Preprocessing ====================
    #  To use an SVM to classify emails into Spam v.s. Non-Spam, you first need
    #  to convert each email into a vector of features. In this part, you will
    #  implement the preprocessing steps for each email. You should
    #  complete the code in processEmail.m to produce a word indices vector
    #  for a given email.

    print('\nPreprocessing sample email (emailSample1.txt)')

    # Extract Features
    file_contents = readFile('emailSample1.txt')
    word_indices  = processEmail(file_contents)

    # Print Stats
    # (+1 converts the 0-based Python indices to the exercise's 1-based ones)
    print('Word Indices: ')
    print(formatter(' %d', np.array(word_indices) + 1))
    print('\n')

    print('Program paused. Press enter to continue.')
    #pause;

    ## ==================== Part 2: Feature Extraction ====================
    #  Now, you will convert each email into a vector of features in R^n. 
    #  You should complete the code in emailFeatures.m to produce a feature
    #  vector for a given email.

    print('\nExtracting features from sample email (emailSample1.txt)')

    # Extract Features
    file_contents = readFile('emailSample1.txt')
    word_indices  = processEmail(file_contents)
    features      = emailFeatures(word_indices)

    # Print Stats
    print('Length of feature vector: %d' % features.size)
    print('Number of non-zero entries: %d' % np.sum(features > 0))

    print('Program paused. Press enter to continue.')
    #pause;

    ## =========== Part 3: Train Linear SVM for Spam Classification ========
    #  In this section, you will train a linear classifier to determine if an
    #  email is Spam or Not-Spam.

    # Load the Spam Email dataset
    # You will have X, y in your environment
    mat = scipy.io.loadmat('spamTrain.mat')
    X = mat['X'].astype(float)
    y = mat['y'][:, 0]

    print('\nTraining Linear SVM (Spam Classification)\n')
    print('(this may take 1 to 2 minutes) ...\n')

    C = 0.1
    model = svmTrain(X, y, C, linearKernel)

    p = svmPredict(model, X)

    print('Training Accuracy: %f' % (np.mean(p == y) * 100))

    ## =================== Part 4: Test Spam Classification ================
    #  After training the classifier, we can evaluate it on a test set. We have
    #  included a test set in spamTest.mat

    # Load the test dataset
    # You will have Xtest, ytest in your environment
    mat = scipy.io.loadmat('spamTest.mat')
    Xtest = mat['Xtest'].astype(float)
    ytest = mat['ytest'][:, 0]

    print('\nEvaluating the trained Linear SVM on a test set ...\n')

    p = svmPredict(model, Xtest)

    print('Test Accuracy: %f\n' % (np.mean(p == ytest) * 100))
    #pause;


    ## ================= Part 5: Top Predictors of Spam ====================
    #  Since the model we are training is a linear SVM, we can inspect the
    #  weights learned by the model to understand better how it is determining
    #  whether an email is spam or not. The following code finds the words with
    #  the highest weights in the classifier. Informally, the classifier
    #  'thinks' that these words are the most likely indicators of spam.
    #

    # Sort the weights and obtain the vocabulary list
    # (argsort is ascending, so the last 15 entries reversed are the
    # 15 largest weights in descending order)
    idx = np.argsort(model['w'])
    top_idx = idx[-15:][::-1]
    vocabList = getVocabList()

    print('\nTop predictors of spam: ')
    for word, w in zip(np.array(vocabList)[top_idx], model['w'][top_idx]):
        print(' %-15s (%f)' % (word, w))
    #end

    print('\n')
    print('\nProgram paused. Press enter to continue.')
    #pause;

    ## =================== Part 6: Try Your Own Emails =====================
    #  Now that you've trained the spam classifier, you can use it on your own
    #  emails! In the starter code, we have included spamSample1.txt,
    #  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples. 
    #  The following code reads in one of these emails and then uses your 
    #  learned SVM classifier to determine whether the email is Spam or 
    #  Not Spam

    # Set the file to be read in (change this to spamSample2.txt,
    # emailSample1.txt or emailSample2.txt to see different predictions on
    # different emails types). Try your own emails as well!
    filename = 'spamSample1.txt'

    # Read and predict
    file_contents = readFile(filename)
    word_indices  = processEmail(file_contents)
    x             = emailFeatures(word_indices)
    p = svmPredict(model, x.ravel())

    print('\nProcessed %s\n\nSpam Classification: %d' % (filename, p))
    print('(1 indicates spam, 0 indicates not spam)\n')
# Example #11
# 0
# Evaluate the trained SVM on the training set.
p = model.predict(X)
print('Training Accuracy: ', np.mean(np.double(p == y.ravel())) * 100)

# =================== Part 4: Test Spam Classification ================

data = loadmat("data/spamTest.mat")
Xtest = data['Xtest']
ytest = data['ytest']
p = model.predict(Xtest)
print('Test Accuracy: ', np.mean(np.double(p == ytest.ravel())) * 100)

# ================= Part 5: Top Predictors of Spam ====================

# In a linear SVM the coefficient magnitudes rank the vocabulary words by
# how strongly they indicate spam; take the 15 largest, descending.
w = model.coef_[0]
idx = np.argsort(w)[::-1][:15]
vocabList = list(getVocabList().keys())
print('Top predictors of spam: ')
for i in idx:
    print("{:15s} {:.3f}".format(vocabList[i], w[i]))

# =================== Part 6: Try Your Own Emails =====================

# The 'with' statement closes the file on exit; the redundant
# file.close() that used to follow the block has been removed.
with open('data/emailSample1.txt', 'r') as file:
    file_contents = file.read()
word_indices = processEmail(file_contents)
x = emailFeatures(word_indices)
p = model.predict(x.T)
print('Spam Classification: ', p)
print('(1 indicates spam, 0 indicates not spam)')
# Example #12
# 0
def processEmail(email_contents):
    """Preprocess the body of an email and return a list of word indices.

    word_indices = PROCESSEMAIL(email_contents) preprocesses the body of
    an email and returns a list of indices of the words contained in it.
    The email is lower-cased; HTML tags are stripped; numbers, URLs,
    email addresses and dollar signs become the canonical tokens
    'number', 'httpaddr', 'emailaddr' and 'dollar'.

    Parameters
    ----------
    email_contents : str
        Raw email body.

    Returns
    -------
    list
        Vocabulary indices (as stored in the vocabulary mapping) of the
        recognised tokens.
    """

    # Load Vocabulary (a mapping word -> index)
    vocabList = gvl.getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # To work with raw emails that still carry full headers, strip
    # everything up to the first blank line:
    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: any <...> tag without nested angle brackets
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers: one or more digits
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS: strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)


    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')

    # Running length of the current echoed output line (wrapped at 78 cols)
    l = 0

    # Split and also get rid of any punctuation
    # regex may need further debugging...
    email_contents = re.split(r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s\n\r\t]+', email_contents)

    # Hoisted out of the loop: the stemmer is stateless across tokens, so
    # constructing it once avoids a per-token allocation.
    stemmer = PorterStemmer()

    for token in email_contents:

        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)

        # Stem the word
        token = stemmer.stem_word(token.strip())

        # Skip the word if it is too short
        if len(token) < 1:
           continue

        # Single dict lookup replaces the previous membership test plus
        # indexing.  0 marks "not in vocabulary" (the stored indices are
        # 1-based, matching the original MATLAB exercise), so only
        # positive indices are kept.
        idx = vocabList.get(token, 0)
        if idx > 0:
            word_indices.append(idx)

        # Print to screen, ensuring that the output lines are not too long
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        # Python 3 fix: the old "print(...)," form printed a newline and
        # built a throwaway (None,) tuple, defeating the wrapping above;
        # end=' ' gives the intended trailing-space behaviour.
        print('{:s}'.format(token), end=' ')
        l = l + len(token) + 1

    # Print footer
    print('\n\n=========================\n')

    return word_indices
def processEmail(email_contents=""):
    """Preprocess an email body and return the vocabulary indices of its words.

    The email is lower-cased; HTML tags are stripped; numbers, URLs,
    email addresses and dollar signs are replaced by the canonical tokens
    'number', 'httpaddr', 'emailaddr' and 'dollar'.  The text is then
    split on punctuation/whitespace, each token is stemmed, and every
    vocabulary position matching a token is collected.

    Parameters
    ----------
    email_contents : str, optional
        Raw email body (default "").

    Returns
    -------
    list[int]
        Vocabulary positions of the recognised tokens.
    """

    vocabList = getVocabList.getVocabList()
    word_indices = []

# ========================== Preprocess Email ===========================

    email_contents = str(email_contents)

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: any <...> tag without nested angle brackets
    email_contents = re.sub('<[^<>]+>',' ', email_contents)

    # Handle Numbers: one or more digits
    email_contents = re.sub('[0-9]+','number', email_contents)

    # Handle URLS: strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*','httpaddr', email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+','emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+','dollar', email_contents)

# ========================== Tokenize Email ===========================

    # Fix: the '$' delimiter was previously written as ' $' (with a stray
    # leading space), forming a two-character delimiter that never matched
    # a bare dollar sign; the duplicated '|' entry was also dropped.
    delimiters = (' ', '@', '$', '|', '/', '#', '.', '-', ':', '&', '*',
                  '+', '=', '[', ']', '?', '!', '(', ')', '{', '}', ',',
                  "'", '"', '>', '_', '<', ';', '%', '\n', '\t')

    regexPattern = '|'.join(map(re.escape, delimiters))

    tokens = re.split(regexPattern, email_contents)

    for token in tokens:
        if len(token) > 0:

            # Drop any remaining non-alphanumeric characters, then stem
            token = re.sub('[^a-zA-Z0-9]', '', token)
            token = stem(token)

            # Record every vocabulary position matching this stem
            # (no break: preserves the original multi-match behaviour).
            for j, vocab_word in enumerate(vocabList):
                if token == vocab_word:
                    word_indices.append(j)

    return word_indices
def main():
    """Driver for the spam-classification exercise.

    Preprocesses a sample email, prints feature statistics, trains a
    linear SVM on spamTrain.mat, evaluates it on spamTest.mat, lists the
    top 15 spam-predicting words, and classifies a sample spam email.
    """

    path = os.getcwd()
    path = os.path.join(path, 'dataSets')

# ===============  Part 1 ====================
    #  To use an SVM to classify emails into Spam v.s. Non-Spam, we first need
    #  to convert each email into a vector of features. In this part, we
    #  implement the preprocessing steps for each email.

    # 'with' closes the file even if read() raises
    with open(os.path.join(path, "emailSample1.txt"), 'r') as f:
        email_contents = f.read()

    print(email_contents)

    word_indices = processEmail.processEmail(email_contents)
    features = emailFeatures(word_indices)

    print('Word Indices :\n')
    print(word_indices, "\n")


#=============  Part 2  =======================
    # Print Stats

    print('Length of feature vector: %d\n' % len(features))
    print('Number of non-zero entries: %d\n' % sum(features))


# =============  Part 3  ======================

    #  In this section, we will train a linear classifier to determine if an
    #  email is Spam or Not-Spam.

    print('\n\nRunning SVM on training set...')

    mat = io.loadmat(os.path.join(path, 'spamTrain.mat'))

    X = mat['X']
    y = numpy.ravel(mat['y'])

    model = svm.SVC(C=0.1, kernel='linear')
    model.fit(X, y)

    # score() already predicts internally; the separate unused
    # model.predict(X) call was removed.
    accuracy = model.score(X, y) * 100.0

    print('\nTraining Accuracy: %.2f' % accuracy)

#================ Part 4 ========================

    # Evaluate on the held-out test set (Xtest / ytest)
    mat = io.loadmat(os.path.join(path, 'spamTest.mat'))

    XTest = mat['Xtest']
    yTest = numpy.ravel(mat['ytest'])

    accuracy = model.score(XTest, yTest) * 100.0

    print('\nTest Accuracy: %.2f' % accuracy)

#================ Part 5 ============================
    #  Since the model we are training is a linear SVM, we can inspect the
    #  weights learned by the model to understand better how it is determining
    #  whether an email is spam or not. The following code finds the words with
    #  the highest weights in the classifier. Informally, the classifier
    #  'thinks' that these words are the most likely indicators of spam.

    print('\nTop spam predictors (keywords) \n')

    z = numpy.ravel(model.coef_)

    vocabList = getVocabList.getVocabList()

    # Map each vocabulary word to its learned weight
    dic = {}
    for i in range(len(z)):
        dic[vocabList[i]] = z[i]

    # Print the 15 words with the largest weights (the sorted slice
    # replaces the previous manual counter loop).
    for w in sorted(dic, key=dic.get, reverse=True)[:15]:
        print('{0:10} - {1:10f}'.format(w, dic[w]))

    print('\n\n')

# ============ Part 6: Test a sample Email =====================
    #  Now that we have trained the spam classifier, we can use it on our own
    #  emails! The following code reads in one of these emails and then uses
    #  our learned SVM classifier to determine whether the email is Spam or
    #  Not Spam

    with open(os.path.join(path, "spamSample1.txt"), 'r') as f:
        email_contents = f.read()

    print('Sample Email : ')
    print(email_contents)

    word_indices = processEmail.processEmail(email_contents)

    # Fix: the feature vector was previously computed twice (once into
    # 'features', once into 'X'); compute it once and reuse it.
    X = emailFeatures(word_indices)

    p = model.predict(X)

    print('\nEmail Processed\n\nSpam Classification: %d\n' % p)
    print('(1 indicates spam, 0 indicates not spam)\n\n')
# Example #15
# 0
def processEmail(email_contents):
    '''
    word_indices = PROCESSEMAIL(email_contents) preprocesses
    the body of an email and returns a list of indices of the
    words contained in the email.

    email_contents : str -- raw email body text.
    Returns a list of vocabulary indices, one per recognized stemmed token,
    in order of appearance.
    '''

    from getVocabList import getVocabList
    import re
    from nltk.stem.porter import PorterStemmer

    # Load Vocabulary.
    # NOTE(review): the .get()/[] lookups below require this to be a
    # dict mapping word -> index.
    vocabList = getVocabList()

    # ========================== Preprocess Email ===========================

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and
    # does not have any < or > in the tag and replace it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]+', 'httpaddr',
                            email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # Handle apostrophe
    email_contents = re.sub(r"[']+", ' ', email_contents)

    # ========================== Tokenize Email ===========================

    # Split on whitespace, then strip any non-alphanumeric characters
    # from each token.
    token_str = re.split(r'[\s]', email_contents)
    token_str = [re.sub('[^a-zA-Z0-9]', '', tok) for tok in token_str]

    # Remove empty strings from the list
    token_str = list(filter(None, token_str))

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')
    print(token_str, '\n')

    # Stem each word using the Porter Stemming algorithm
    porter_stemmer = PorterStemmer()
    word_stem = [porter_stemmer.stem(word) for word in token_str]

    # Look up each stemmed word in the vocabulary and record its index.
    # Bug fix: the original used `if vocabList.get(word):`, which is falsy
    # when the stored index is 0 and would silently drop that entry;
    # an explicit membership test keeps every mapped word.
    word_indices = []
    for word in word_stem:
        if word in vocabList:
            word_indices.append(vocabList[word])

    return word_indices
Example #16
0
def processEmail(email_contents):
    #PROCESSEMAIL preprocesses a the body of an email and
    #returns a list of word_indices
    #   word_indices = PROCESSEMAIL(email_contents) preprocesses
    #   the body of an email and returns a list of indices of the
    #   words contained in the email.
    #

    # Load Vocabulary
    # NOTE(review): the .index() lookup below requires this to be a list.
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = strfind(email_contents, ([char(10) char(10)]));
    # email_contents = email_contents(hdrstart(1):end);

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and
    # does not have any < or > in the tag, and replaces it with a space
    email_contents = re.compile('<[^<>]+>').sub(' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.compile('[0-9]+').sub(' number ', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.compile('(http|https)://[^\\s]*').sub(
        ' httpaddr ', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.compile('[^\\s]+@[^\\s]+').sub(' emailaddr ',
                                                       email_contents)

    # Handle $ sign
    email_contents = re.compile('[$]+').sub(' dollar ', email_contents)

    # Other: split on whitespace and punctuation.
    # NOTE(review): '.-:' inside the class is a character *range* covering
    # the digits as well; harmless here only because digits were already
    # replaced by ' number ' above.
    email_contents = re.split('[ @$/#.-:&*+=\\[\\]?!(){},'
                              '\">_<;%\\n\\r]', email_contents)
    email_contents = [word for word in email_contents if len(word) > 0]

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n')

    # Process file.
    # Hoist the per-token cleanup pattern out of the loop.
    non_alnum = re.compile('[^a-zA-Z0-9]')
    stemmer = PorterStemmer()
    processed_email = []
    for word in email_contents:
        # Strip any remaining non-alphanumeric characters, then stem.
        word = non_alnum.sub('', word).strip()
        word = stemmer.stem(word)
        # Skip the word if it is too short.
        # Bug fix: the original appended to processed_email *before* this
        # check, so tokens reduced to the empty string still showed up in
        # the printed output (as doubled spaces).
        if len(word) < 1:
            continue
        processed_email.append(word)
        # Look up the word in the vocabulary list and record its index
        # if found (EAFP: list.index raises ValueError on a miss).
        try:
            index = vocabList.index(word)
        except ValueError:
            pass
        else:
            word_indices.append(index)
    print(' '.join(processed_email))
    # Print footer
    print('\n\n=========================')
    return word_indices
def processEmail(email_contents):
    '''
    Preprocess the body of an email and return the list of vocabulary
    indices of the words it contains.  Also echoes the processed tokens
    to the screen, wrapped at ~78 columns.
    '''

    # Load Vocabulary
    # NOTE(review): indexed positionally below, so this must be a list.
    vocabList = getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and
    # does not have any < or > in the tag, and replaces it with a space
    email_contents = re.sub(r"<[^<>]+>", " ", email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub(r"[0-9]+", "number", email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub(r"(http|https)://[^\s]*", "httpaddr", email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub(r"[^\s]+@[^\s]+", "emailaddr", email_contents)

    # Handle $ sign
    email_contents = re.sub(r"[$]+", "dollar", email_contents)


    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')

    # Process file: l tracks the current printed line length so the echoed
    # tokens wrap at 78 columns.
    l = 0

    strs = re.split(r'[ `\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?\n\t]', email_contents)

    p = PorterStemmer()
    for _str in strs:

        # Remove any non alphanumeric characters
        _str = re.sub(r"[^a-zA-Z0-9]", "", _str)

        # Stem the word
        # (the porterStemmer sometimes has issues, so we use a try catch block)
        _str = p.stem(_str)

        # Skip the word if it is too short
        if len(_str) < 1:
            continue

        # Record the index of the first matching vocabulary entry.
        for i in range(len(vocabList)):
            if _str == vocabList[i]:
                word_indices.append(i)
                break

        if l + len(_str) + 1 > 78:
            print('')
            l = 0

        print(_str+" ", end='')
        # Bug fix: the original never updated l, so the 78-column wrap
        # above could never trigger.
        l = l + len(_str) + 1

    # Print footer
    print('\n\n=========================\n')
    return word_indices
Example #18
0
print 'Test Accuracy: %f', np.mean(np.double(p == ytest.flatten())) * 100

## ================= Part 5: Top Predictors of Spam ====================
#  Since the model we are training is a linear SVM, we can inspect the
#  weights learned by the model to understand better how it is determining
#  whether an email is spam or not. The following code finds the words with
#  the highest weights in the classifier. Informally, the classifier
#  'thinks' that these words are the most likely indicators of spam.

# Sort the weights and obtain the vocabulary list

t = sorted(list(enumerate(model.coef_[0])), key=lambda e: e[1], reverse=True)
d = OrderedDict(t)
idx = d.keys()
weight = d.values()
vocabList = getVocabList()

print 'Top predictors of spam: '
for i in range(15):
    print ' %-15s (%f)' % (vocabList[idx[i]], weight[i])

print 'Program paused. Press enter to continue.'

## =================== Part 6: Try Your Own Emails =====================
#  Now that you've trained the spam classifier, you can use it on your own
#  emails! In the starter code, we have included spamSample1.txt,
#  spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
#  The following code reads in one of these emails and then uses your
#  learned SVM classifier to determine whether the email is Spam or
#  Not Spam
Example #19
0
def processEmail(email_contents):
    '''
    Preprocess the body of an email and return the list of vocabulary
    indices of the words it contains.  Echoes each recognized token to the
    screen, wrapped at ~78 columns.
    '''
    # NOTE(review): the `vocabList[token]` / `idx > 0` logic below requires
    # a dict mapping word -> index, with indices starting at 1 (the Octave
    # convention) — confirm against getVocabList.
    vocabList = getVocabList()
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and
    # does not have any < or > in the tag, and replaces it with a space
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub('(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub('[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # Tokenize and get rid of any punctuation
    # NOTE(review): '.-:' inside the class is a character *range* that also
    # covers the digits; harmless here only because digits were already
    # replaced by 'number' above.
    email_contents = re.split(r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s]+', email_contents)

    # Process file: l tracks the current printed line length so the echoed
    # tokens wrap at 78 columns.
    l = 0
    for token in email_contents:
        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)
        # Stem the word
        token = PorterStemmer().stem(token.strip())
        # Skip the word if it is too short
        if len(token) < 1:
           continue
        # ind is 0 when the token is not in the vocabulary; only positive
        # (1-based) indices are recorded.
        idx = vocabList[token] if token in vocabList else 0
        if idx > 0:
            word_indices.append(idx)
        # Print to screen, ensuring that the output lines are not too long
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        # Bug fix: the original used print(token), emitting one token per
        # line and defeating the wrapping logic above; print on the same
        # line, as the companion implementation does.
        print(token + " ", end='')
        l = l + len(token) + 1

    # Print footer
    #print('\n\n=========================\n')

    return word_indices