Beispiel #1
0
def processEmail(email_contents, verbose=True):
    """
    Preprocess an email body and map its words to vocabulary indices.

    The text is lower-cased, HTML tags are blanked out, and numbers, URLs,
    email addresses and dollar signs are replaced by the placeholder tokens
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result is
    split on punctuation/whitespace, every token is reduced to its
    alphanumeric characters and stemmed, and each stem that occurs in the
    vocabulary contributes its position to the returned list.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the processed (stemmed) email.

    Returns
    -------
    word_indices : list
        Positions, as returned by ``vocabList.index``, of the stemmed words
        that occur in the vocabulary, in order of appearance. A word that
        occurs several times in the email yields several entries.
    """
    vocabList = utils.getVocabList()
    word_indices = []

    # Normalisation. Raw strings keep the regex escapes (\s, \[) away from
    # Python's own string-escape processing.
    text = email_contents.lower()
    text = re.sub(r'<[^<>]+>', ' ', text)
    text = re.sub(r'[0-9]+', ' number ', text)
    text = re.sub(r'(http|https)://[^\s]*', ' httpaddr ', text)
    text = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', text)
    text = re.sub(r'[$]+', ' dollar ', text)

    # Split on punctuation and whitespace, then drop the empty pieces.
    # NOTE: inside the character class ".-:" is a range ('.' through ':'),
    # so a literal '-' is not a delimiter here; hyphens are removed later by
    # the alphanumeric filter instead.
    pieces = re.split(r'[ @$/#.-:&*+=\[\]?!(){},'
                      '">_<;%\n\r]', text)
    tokens = [piece for piece in pieces if len(piece) > 0]

    non_alnum = re.compile(r'[^a-zA-Z0-9]')  # compiled once, used per token
    stemmer = utils.PorterStemmer()
    processed_email = []
    for token in tokens:
        word = stemmer.stem(non_alnum.sub('', token).strip())
        processed_email.append(word)

        # A token made only of stray symbols is empty after filtering.
        if len(word) < 1:
            continue

        # A single scan of the vocabulary: index() raises ValueError when
        # the word is absent, which simply means "not a vocabulary word".
        try:
            position = vocabList.index(word)
        except ValueError:
            continue
        word_indices.append(position)

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))
    return word_indices
Beispiel #2
0
def processEmail(email_contents, verbose=True):
    """
    Preprocess an email body and map its words to vocabulary indices.

    The text is lower-cased, HTML tags are blanked out, and numbers, URLs,
    email addresses and dollar signs are replaced by the placeholder tokens
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result is
    split on punctuation/whitespace, every token is reduced to its
    alphanumeric characters and stemmed, and the stems are looked up in the
    vocabulary.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the processed (stemmed) email.

    Returns
    -------
    word_indices : list
        For every stemmed word, in order of appearance, the position of each
        matching vocabulary entry plus one (i.e. one-based positions).
    """
    vocabList = utils.getVocabList()
    word_indices = []

    # Normalisation. Raw strings keep the regex escapes (\s, \[) away from
    # Python's own string-escape processing.
    body = email_contents.lower()
    body = re.sub(r'<[^<>]+>', ' ', body)
    body = re.sub(r'[0-9]+', ' number ', body)
    body = re.sub(r'(http|https)://[^\s]*', ' httpaddr ', body)
    body = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', body)
    body = re.sub(r'[$]+', ' dollar ', body)

    # Split on punctuation and whitespace, then drop the empty pieces.
    fragments = re.split(r'[ @$/#.-:&*+=\[\]?!(){},'
                         '">_<;%\n\r]', body)
    fragments = [fragment for fragment in fragments if len(fragment) > 0]

    strip_non_alnum = re.compile(r'[^a-zA-Z0-9]')  # hoisted out of the loop
    stemmer = utils.PorterStemmer()
    processed_email = []
    for fragment in fragments:
        # Remove any remaining non alphanumeric characters, then stem
        word = stemmer.stem(strip_non_alnum.sub('', fragment).strip())
        processed_email.append(word)

        if len(word) < 1:
            continue

        # Record every vocabulary entry equal to the word.
        # NOTE(review): the "+ 1" makes the stored indices one-based, while
        # plain list positions are zero-based - confirm that the code
        # consuming word_indices expects one-based values.
        word_indices.extend(
            position + 1
            for position, entry in enumerate(vocabList)
            if word == entry
        )

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))
    return word_indices
Beispiel #3
0
def processEmail(email_contents, verbose=True):
    """
    Preprocesses the body of an email and returns a list of indices
    of the words contained in the email.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the resulting email after processing.

    Returns
    -------
    word_indices : list
        A list of integers containing the index of each word in the
        email which is also present in the vocabulary. Indices appear in
        the order the words occur in the email; a repeated word yields
        repeated indices.

    Notes
    -----
    - Each stemmed word is looked up in the vocabulary list (vocabList).
      For example, if vocabList[18] = 'action' and the email contains a
      word whose stem is 'action', then 18 is appended to word_indices.

    - vocabList[idx] returns the word with index idx in the vocabulary list.

    - vocabList.index(word) returns the index of `word` in the vocabulary
      list. (A ValueError exception is raised if the word does not exist;
      such words are skipped.)
    """
    # Load Vocabulary
    vocabList = utils.getVocabList()

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================
    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers
    # hdrstart = email_contents.find(chr(10) + chr(10))
    # email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: anything that starts with <, ends with > and has no
    # < or > in between is replaced with a space
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)

    # Handle Numbers: one or more characters between 0-9
    email_contents = re.sub(r'[0-9]+', ' number ', email_contents)

    # Handle URLS: strings starting with http:// or https://
    email_contents = re.sub(
        r'(http|https)://[^\s]*', ' httpaddr ', email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', email_contents)

    # Handle $ sign
    email_contents = re.sub(r'[$]+', ' dollar ', email_contents)

    # Get rid of any punctuation. The first literal is raw so that \[ and \]
    # reach the regex engine untouched; the second is a normal literal whose
    # \n and \r are the actual newline / carriage-return characters.
    email_contents = re.split(r'[ @$/#.-:&*+=\[\]?!(){},'
                              '">_<;%\n\r]', email_contents)

    # remove any empty word string
    email_contents = [word for word in email_contents if len(word) > 0]

    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')  # compiled once
    processed_email = []
    for word in email_contents:
        # Remove any remaining non alphanumeric characters in word
        word = non_alphanumeric.sub('', word).strip()
        word = stemmer.stem(word)
        processed_email.append(word)

        if len(word) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if
        # found. One index() call replaces the former "in" test followed by
        # index(), which scanned the vocabulary twice for every hit.
        try:
            addition = vocabList.index(word)
        except ValueError:
            # word is not part of the vocabulary
            continue
        word_indices.append(addition)

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))
    return word_indices
Beispiel #4
0
# Read the held-out test split; this defines Xtest and ytest
test_file = os.path.join('Data', 'spamTest.mat')
data = loadmat(test_file)
Xtest = data['Xtest'].astype(float)
ytest = data['ytest'][:, 0]  # first column of the stored 'ytest' entry

print('Evaluating the trained Linear SVM on a test set ...')
p = utils.svmPredict(model, Xtest)

# Share of predictions that equal the test labels, as a percentage
accuracy = np.mean(p == ytest) * 100
print('Test Accuracy: %.2f' % accuracy)

# Rank the model weights and look up the matching vocabulary words.
# NOTE some words have the same weights,
# so their order might be different than in the text above
idx = np.argsort(model['w'])
# argsort is ascending, so reverse first and keep the leading 15 entries
top_idx = idx[::-1][:15]
vocabList = utils.getVocabList()

print('Top predictors of spam:')
print('%-15s %-15s' % ('word', 'weight'))
print('----' + ' '*12 + '------')
top_words = np.array(vocabList)[top_idx]
top_weights = model['w'][top_idx]
for word, w in zip(top_words, top_weights):
    print('%-15s %0.2f' % (word, w))

# ---------------------- Trying own Emails (optional) ------------------------------------------

filename = os.path.join('Data', 'emailSample1.txt')

# Read the whole sample email into memory
with open(filename) as fid:
    file_contents = fid.read()

# Convert the email text into vocabulary indices without printing it
word_indices = processEmail(file_contents, verbose=False)
Beispiel #5
0
def processEmail(email_contents, verbose=True):
    """
    Preprocess an email body and map its words to vocabulary indices.

    The text is lower-cased, HTML tags are blanked out, and numbers, URLs,
    email addresses and dollar signs are replaced by the placeholder tokens
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result is
    split on punctuation/whitespace, every token is reduced to its
    alphanumeric characters and stemmed, and each stem that occurs in the
    vocabulary contributes its position to the returned list.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the processed (stemmed) email.

    Returns
    -------
    word_indices : list
        Position of the first vocabulary entry equal to each stemmed word
        (the same value ``vocabList.index(word)`` gives), in order of
        appearance. Words missing from the vocabulary are skipped; a
        repeated word yields repeated entries.
    """
    # Load Vocabulary
    vocabList = utils.getVocabList()

    # Build the word -> position table once so each lookup in the loop
    # below is a dictionary access instead of a scan of the whole list.
    # setdefault keeps the FIRST position of a word, matching list.index().
    vocab_position = {}
    for position, entry in enumerate(vocabList):
        vocab_position.setdefault(entry, position)

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================
    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers
    # hdrstart = email_contents.find(chr(10) + chr(10))
    # email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: anything that starts with <, ends with > and has no
    # < or > in between is replaced with a space
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)

    # Handle Numbers: one or more characters between 0-9
    email_contents = re.sub(r'[0-9]+', ' number ', email_contents)

    # Handle URLS: strings starting with http:// or https://
    email_contents = re.sub(
        r'(http|https)://[^\s]*', ' httpaddr ', email_contents)

    # Handle Email Addresses: strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', email_contents)

    # Handle $ sign
    email_contents = re.sub(r'[$]+', ' dollar ', email_contents)

    # Get rid of any punctuation. The first literal is raw so that \[ and \]
    # reach the regex engine untouched; the second is a normal literal whose
    # \n and \r are the actual newline / carriage-return characters.
    email_contents = re.split(r'[ @$/#.-:&*+=\[\]?!(){},'
                              '">_<;%\n\r]', email_contents)

    # remove any empty word string
    email_contents = [word for word in email_contents if len(word) > 0]

    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')  # compiled once
    processed_email = []
    for word in email_contents:
        # Remove any remaining non alphanumeric characters in word
        word = non_alphanumeric.sub('', word).strip()
        word = stemmer.stem(word)
        processed_email.append(word)

        if len(word) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if found
        index = vocab_position.get(word)
        if index is not None:
            word_indices.append(index)

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))
    return word_indices
    def processEmail(email_contents, verbose=True):
        """
        Preprocess an email body and return the distinct vocabulary indices
        of its words.

        The text is lower-cased, HTML tags are blanked out, and numbers,
        URLs, email addresses and dollar signs are replaced by the
        placeholder tokens ``number``, ``httpaddr``, ``emailaddr`` and
        ``dollar``. The result is split on punctuation/whitespace, every
        token is reduced to its alphanumeric characters and stemmed, and
        the stems are looked up in the vocabulary.

        Parameters
        ----------
        email_contents : str
            A string containing one email.

        verbose : bool
            If True, print the processed (stemmed) email.

        Returns
        -------
        word_indices : list
            The values of ``vocabList.index(word)`` for the stemmed words
            found in the vocabulary. The indices are collected in a set, so
            each one appears at most once and the list follows the set's
            iteration order rather than the order of appearance.
        """
        # load vocabulary
        vocabList = utils.getVocabList()

        # init return value (a set, so repeated words are recorded once)
        word_indices = set()

        # Lower case
        email_contents = email_contents.lower()

        # Strip all HTML: anything that starts with <, ends with > and has
        # no < or > in between is replaced with a space
        email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)

        # Handle Numbers: one or more characters between 0-9
        email_contents = re.sub(r'[0-9]+', ' number ', email_contents)

        # Handle URLS: strings starting with http:// or https://
        email_contents = re.sub(
            r'(http|https)://[^\s]*', ' httpaddr ', email_contents)

        # Handle Email Addresses: strings with @ in the middle
        email_contents = re.sub(
            r'[^\s]+@[^\s]+', ' emailaddr ', email_contents)

        # Handle $ sign
        email_contents = re.sub(r'[$]+', ' dollar ', email_contents)

        # Get rid of any punctuation. The first literal is raw so that \[
        # and \] reach the regex engine untouched; the second is a normal
        # literal whose \n and \r are the actual newline / carriage-return
        # characters.
        email_contents = re.split(r'[ @$/#.-:&*+=\[\]?!(){},'
                                  '">_<;%\n\r]', email_contents)

        # remove any empty word string
        email_contents = [word for word in email_contents if len(word) > 0]

        # Stem the email contents word by word
        stemmer = utils.PorterStemmer()
        non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')  # compiled once
        processed_email = []

        for word in email_contents:
            # Remove any remaining non alphanumeric characters in word
            word = non_alphanumeric.sub('', word).strip()

            # reduce words to their core
            word = stemmer.stem(word)
            processed_email.append(word)

            if len(word) < 1:
                continue

            # Only the lookup can raise; a ValueError means the word is
            # not in the vocabulary and is skipped.
            try:
                index = vocabList.index(word)
            except ValueError:
                continue
            word_indices.add(index)

        if verbose:
            print('----------------')
            print('Processed email:')
            print('----------------')
            print(' '.join(processed_email))

        return list(word_indices)