def processEmail(email_contents, verbose=True):
    """
    Convert the body of an email into a list of vocabulary indices.

    The text is lower-cased, HTML tags are removed, and numbers, URLs,
    email addresses and dollar signs are replaced by the placeholder words
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result is
    split into words, every word is stemmed, and each stemmed word that
    occurs in the vocabulary contributes its index to the output.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        Accepted for interface compatibility only; this implementation
        never prints anything.

    Returns
    -------
    word_indices : list
        Zero-based positions in the vocabulary list (the value
        ``vocabList.index(word)`` would give) of every recognised word, in
        order of appearance, repeats included.
    """
    vocabList = utils.getVocabList()

    # One pass over the vocabulary gives constant-time lookups below.
    # setdefault keeps the first position of a word, like list.index().
    vocab_positions = {}
    for position, entry in enumerate(vocabList):
        vocab_positions.setdefault(entry, position)

    # Normalisation. The order matters: numbers are rewritten before URLs
    # and addresses are detected.
    text = email_contents.lower()
    text = re.sub(r'<[^<>]+>', ' ', text)
    text = re.sub(r'[0-9]+', ' number ', text)
    text = re.sub(r'(http|https)://[^\s]*', ' httpaddr ', text)
    text = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', text)
    text = re.sub(r'[$]+', ' dollar ', text)

    # NOTE(review): inside the character class `.-:` is a range ('.', '/',
    # digits, ':'), so '-' is not a separator here; hyphens are removed by
    # the alphanumeric filter below instead. Kept as-is so tokenisation
    # does not change.
    tokens = re.split(r'[ @$/#.-:&*+=\[\]?!(){},">_<;%\n\r]', text)

    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')

    word_indices = []
    for token in tokens:
        if not token:
            continue
        word = stemmer.stem(non_alphanumeric.sub('', token).strip())
        if not word:
            continue
        position = vocab_positions.get(word)
        if position is not None:
            word_indices.append(position)

    return word_indices
def processEmail(email_contents, verbose=True):
    """
    Preprocess the body of an email and return the vocabulary indices of
    the words it contains.

    The text is lower-cased, HTML tags are removed, and numbers, URLs,
    email addresses and dollar signs are replaced by the placeholder words
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result is
    split into words and every word is stemmed before being looked up in
    the vocabulary.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the stemmed words of the processed email.

    Returns
    -------
    word_indices : list
        Zero-based positions in the vocabulary list of every recognised
        word, in order of appearance, repeats included. If
        ``vocabList[18] == 'action'`` then the word 'action' yields 18.
    """
    vocabList = utils.getVocabList()

    # One pass over the vocabulary gives constant-time lookups below.
    # The stored value is the plain list position (0-based), which is what
    # vocabList[idx] expects; the earlier `i + 1` shifted every index by
    # one.
    vocab_positions = {}
    for position, entry in enumerate(vocabList):
        vocab_positions.setdefault(entry, position)

    # Normalisation. The order matters: numbers are rewritten before URLs
    # and addresses are detected.
    text = email_contents.lower()
    text = re.sub(r'<[^<>]+>', ' ', text)
    text = re.sub(r'[0-9]+', ' number ', text)
    text = re.sub(r'(http|https)://[^\s]*', ' httpaddr ', text)
    text = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', text)
    text = re.sub(r'[$]+', ' dollar ', text)

    # NOTE(review): inside the character class `.-:` is a range ('.', '/',
    # digits, ':'), so '-' is not a separator here; hyphens are removed by
    # the alphanumeric filter below instead. Kept as-is so tokenisation
    # does not change.
    tokens = re.split(r'[ @$/#.-:&*+=\[\]?!(){},">_<;%\n\r]', text)

    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')

    word_indices = []
    processed_email = []
    for token in tokens:
        if not token:
            continue
        # Remove any remaining non alphanumeric characters, then stem
        word = stemmer.stem(non_alphanumeric.sub('', token).strip())
        # Recorded before the emptiness check, so empty stems show up as
        # extra spaces in the verbose output.
        processed_email.append(word)
        if not word:
            continue
        position = vocab_positions.get(word)
        if position is not None:
            word_indices.append(position)

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))

    return word_indices
def processEmail(email_contents, verbose=True):
    """
    Preprocesses the body of an email and returns a list of indices of the
    words contained in the email.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the resulting email after processing.

    Returns
    -------
    word_indices : list
        A list of integers containing the index of each word in the email
        which is also present in the vocabulary, in order of appearance
        (repeated words appear repeatedly).

    Notes
    -----
    - Indices are positions in the vocabulary list: if
      ``vocabList[18] == 'action'`` then the word 'action' contributes 18.
    - Before lookup the text is lower-cased, HTML tags are removed,
      numbers, URLs, email addresses and dollar signs are replaced by the
      words ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``, and
      each remaining word is stemmed.
    - For raw emails that still carry full headers, drop everything up to
      the first blank line before calling this function, e.g.
      ``email_contents[email_contents.find(chr(10) + chr(10)):]``.
    """
    # Load Vocabulary
    vocabList = utils.getVocabList()

    # One pass over the vocabulary gives constant-time lookups below.
    # setdefault keeps the first position of a word, like list.index().
    vocab_positions = {}
    for position, entry in enumerate(vocabList):
        vocab_positions.setdefault(entry, position)

    # ========================== Preprocess Email ===========================
    # Lower case
    text = email_contents.lower()

    # Strip all HTML: anything that starts with <, ends with > and has no
    # < or > in between is replaced with a space
    text = re.sub(r'<[^<>]+>', ' ', text)

    # Handle numbers: one or more characters between 0-9
    text = re.sub(r'[0-9]+', ' number ', text)

    # Handle URLs: strings starting with http:// or https://
    text = re.sub(r'(http|https)://[^\s]*', ' httpaddr ', text)

    # Handle email addresses: strings with @ in the middle
    text = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', text)

    # Handle $ sign
    text = re.sub(r'[$]+', ' dollar ', text)

    # Get rid of any punctuation.
    # NOTE(review): inside the character class `.-:` is a range ('.', '/',
    # digits, ':'), so '-' is not a separator here; hyphens are removed by
    # the alphanumeric filter below instead. Kept as-is so tokenisation
    # does not change.
    tokens = re.split(r'[ @$/#.-:&*+=\[\]?!(){},">_<;%\n\r]', text)

    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')

    word_indices = []
    processed_email = []
    for token in tokens:
        # Skip the empty strings produced by adjacent separators
        if not token:
            continue
        # Remove any remaining non alphanumeric characters, then stem
        word = stemmer.stem(non_alphanumeric.sub('', token).strip())
        # Recorded before the emptiness check, so empty stems show up as
        # extra spaces in the verbose output.
        processed_email.append(word)
        if not word:
            continue
        # Look up the word in the vocabulary and keep its index if found
        position = vocab_positions.get(word)
        if position is not None:
            word_indices.append(position)

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))

    return word_indices
# Load the test dataset # You will have Xtest, ytest in your environment data = loadmat(os.path.join('Data', 'spamTest.mat')) Xtest, ytest = data['Xtest'].astype(float), data['ytest'][:, 0] print('Evaluating the trained Linear SVM on a test set ...') p = utils.svmPredict(model, Xtest) print('Test Accuracy: %.2f' % (np.mean(p == ytest) * 100)) # Sort the weights and obtin the vocabulary list # NOTE some words have the same weights, # so their order might be different than in the text above idx = np.argsort(model['w']) top_idx = idx[-15:][::-1] vocabList = utils.getVocabList() print('Top predictors of spam:') print('%-15s %-15s' % ('word', 'weight')) print('----' + ' '*12 + '------') for word, w in zip(np.array(vocabList)[top_idx], model['w'][top_idx]): print('%-15s %0.2f' % (word, w)) # ---------------------- Trying own Emails (optional) ------------------------------------------ filename = os.path.join('Data', 'emailSample1.txt') with open(filename) as fid: file_contents = fid.read() word_indices = processEmail(file_contents, verbose=False)
def processEmail(email_contents, verbose=True):
    """
    Preprocess the body of an email and return the vocabulary indices of
    the words it contains.

    The text is lower-cased, HTML tags are removed, and numbers, URLs,
    email addresses and dollar signs are replaced by the placeholder words
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result is
    split into words and every word is stemmed before being looked up in
    the vocabulary.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the stemmed words of the processed email.

    Returns
    -------
    word_indices : list
        Zero-based positions in the vocabulary list (the value
        ``vocabList.index(word)`` would give) of every recognised word, in
        order of appearance, repeats included.
    """
    # Load Vocabulary
    vocabList = utils.getVocabList()

    # One pass over the vocabulary gives constant-time lookups below, so a
    # word missing from the vocabulary no longer costs a full list scan
    # plus a raised ValueError. setdefault keeps the first position of a
    # word, like list.index().
    vocab_positions = {}
    for position, entry in enumerate(vocabList):
        vocab_positions.setdefault(entry, position)

    # ========================== Preprocess Email ===========================
    # For raw emails with full headers, strip everything before the first
    # blank line prior to calling this function.

    # Lower case
    text = email_contents.lower()

    # Strip all HTML: anything that starts with <, ends with > and has no
    # < or > in between is replaced with a space
    text = re.sub(r'<[^<>]+>', ' ', text)

    # Handle numbers: one or more characters between 0-9
    text = re.sub(r'[0-9]+', ' number ', text)

    # Handle URLs: strings starting with http:// or https://
    text = re.sub(r'(http|https)://[^\s]*', ' httpaddr ', text)

    # Handle email addresses: strings with @ in the middle
    text = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', text)

    # Handle $ sign
    text = re.sub(r'[$]+', ' dollar ', text)

    # Get rid of any punctuation.
    # NOTE(review): inside the character class `.-:` is a range ('.', '/',
    # digits, ':'), so '-' is not a separator here; hyphens are removed by
    # the alphanumeric filter below instead. Kept as-is so tokenisation
    # does not change.
    tokens = re.split(r'[ @$/#.-:&*+=\[\]?!(){},">_<;%\n\r]', text)

    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')

    word_indices = []
    processed_email = []
    for token in tokens:
        # Skip the empty strings produced by adjacent separators
        if not token:
            continue
        # Remove any remaining non alphanumeric characters, then stem
        word = stemmer.stem(non_alphanumeric.sub('', token).strip())
        # Recorded before the emptiness check, so empty stems show up as
        # extra spaces in the verbose output.
        processed_email.append(word)
        if not word:
            continue
        # Look up the word in the vocabulary and keep its index if found
        position = vocab_positions.get(word)
        if position is not None:
            word_indices.append(position)

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))

    return word_indices
def processEmail(email_contents, verbose=True):
    """
    Preprocess the body of an email and return the distinct vocabulary
    indices of the words it contains.

    The text is lower-cased, HTML tags are removed, and numbers, URLs,
    email addresses and dollar signs are replaced by the placeholder words
    ``number``, ``httpaddr``, ``emailaddr`` and ``dollar``. The result is
    split into words and every word is stemmed before being looked up in
    the vocabulary.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the stemmed words of the processed email.

    Returns
    -------
    word_indices : list
        Zero-based positions in the vocabulary list of the recognised
        words. Each index appears once, however often the word occurs.
        The list is built from a set, so its order carries no meaning.
    """
    # load vocabulary
    vocabList = utils.getVocabList()

    # One pass over the vocabulary gives constant-time lookups below, so a
    # word missing from the vocabulary no longer costs a full list scan
    # plus a raised ValueError. setdefault keeps the first position of a
    # word, like list.index().
    vocab_positions = {}
    for position, entry in enumerate(vocabList):
        vocab_positions.setdefault(entry, position)

    # Lower case
    text = email_contents.lower()

    # Strip all HTML: anything that starts with <, ends with > and has no
    # < or > in between is replaced with a space
    text = re.sub(r'<[^<>]+>', ' ', text)

    # Handle numbers: one or more characters between 0-9
    text = re.sub(r'[0-9]+', ' number ', text)

    # Handle URLs: strings starting with http:// or https://
    text = re.sub(r'(http|https)://[^\s]*', ' httpaddr ', text)

    # Handle email addresses: strings with @ in the middle
    text = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', text)

    # Handle $ sign
    text = re.sub(r'[$]+', ' dollar ', text)

    # Get rid of any punctuation.
    # NOTE(review): inside the character class `.-:` is a range ('.', '/',
    # digits, ':'), so '-' is not a separator here; hyphens are removed by
    # the alphanumeric filter below instead. Kept as-is so tokenisation
    # does not change.
    tokens = re.split(r'[ @$/#.-:&*+=\[\]?!(){},">_<;%\n\r]', text)

    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')

    # A set, so that repeated words contribute their index only once
    word_indices = set()
    processed_email = []
    for token in tokens:
        # Skip the empty strings produced by adjacent separators
        if not token:
            continue
        # Remove any remaining non alphanumeric characters, then reduce
        # the word to its stem
        word = stemmer.stem(non_alphanumeric.sub('', token).strip())
        # Recorded before the emptiness check, so empty stems show up as
        # extra spaces in the verbose output.
        processed_email.append(word)
        if not word:
            continue
        position = vocab_positions.get(word)
        if position is not None:
            word_indices.add(position)

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))

    return list(word_indices)