def processEmail(email_contents, verbose=True):
    """
    Normalize an email body and map its words to vocabulary indices.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        Accepted for interface compatibility; this implementation does not
        print anything and ignores the flag.

    Returns
    -------
    word_indices : list
        For every processed word of the email that is present in the
        vocabulary returned by ``utils.getVocabList()``, the 0-based position
        of its first occurrence in that vocabulary, in email order
        (repeated words yield repeated indices).
    """
    vocabList = utils.getVocabList()

    # Map each vocabulary word to the position of its first occurrence once,
    # instead of scanning the whole list for every word of the email.
    # setdefault keeps the first position, matching list.index semantics.
    vocab_positions = {}
    for position, vocab_word in enumerate(vocabList):
        vocab_positions.setdefault(vocab_word, position)

    # Normalization pipeline; the order matters because later patterns rely
    # on the placeholders inserted by earlier ones.
    email_contents = email_contents.lower()
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    email_contents = re.sub(r'[0-9]+', ' number ', email_contents)
    email_contents = re.sub(r'(http|https)://[^\s]*', ' httpaddr ',
                            email_contents)
    email_contents = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', email_contents)
    email_contents = re.sub(r'[$]+', ' dollar ', email_contents)

    # Split on whitespace and punctuation.  Inside the character class the
    # sequence ".-:" is a range covering '.', '/', the digits and ':'; it is
    # kept exactly as it was so tokenization does not change.
    tokens = re.split(r'[ @$/#.-:&*+=\[\]?!(){},">_<;%\n\r]', email_contents)

    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')

    word_indices = []
    for token in tokens:
        if not token:
            continue
        word = stemmer.stem(non_alphanumeric.sub('', token).strip())
        # Skip tokens that were reduced to nothing by cleaning/stemming.
        if len(word) < 1:
            continue
        if word in vocab_positions:
            word_indices.append(vocab_positions[word])

    return word_indices
def processEmail(email_contents, verbose=True):
    """
    Preprocess the body of an email and return vocabulary indices.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the resulting email after processing.

    Returns
    -------
    word_indices : list
        For every processed word of the email, the 1-based positions
        (list position plus one) of all entries of the vocabulary returned by
        ``utils.getVocabList()`` that equal the word, in email order.

    Notes
    -----
    NOTE(review): the indices are deliberately kept 1-based, exactly as this
    function has always returned them. A consumer that indexes a 0-based
    array with these values must subtract one - confirm against the caller.
    """
    vocabList = utils.getVocabList()

    # Build the word -> [1-based positions] table once so the vocabulary is
    # not rescanned for every word of the email.  A list of positions is kept
    # per word because the original scan appended one index per matching
    # entry, duplicates included.
    positions_by_word = {}
    for i, item in enumerate(vocabList):
        positions_by_word.setdefault(item, []).append(i + 1)

    # Lower case
    email_contents = email_contents.lower()

    # Strip HTML tags, then replace numbers, URLs, email addresses and
    # dollar signs with placeholder words.
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    email_contents = re.sub(r'[0-9]+', ' number ', email_contents)
    email_contents = re.sub(r'(http|https)://[^\s]*', ' httpaddr ',
                            email_contents)
    email_contents = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', email_contents)
    email_contents = re.sub(r'[$]+', ' dollar ', email_contents)

    # Split on whitespace and punctuation.  ".-:" inside the class is a range
    # ('.', '/', digits, ':'); it is preserved so tokenization is unchanged.
    email_contents = re.split(r'[ @$/#.-:&*+=\[\]?!(){},">_<;%\n\r]',
                              email_contents)

    # Remove any empty word string
    email_contents = [word for word in email_contents if len(word) > 0]

    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')

    word_indices = []
    processed_email = []
    for word in email_contents:
        # Remove any remaining non alphanumeric characters in word
        word = non_alphanumeric.sub('', word).strip()
        word = stemmer.stem(word)
        # Recorded before the emptiness check so the verbose output is the
        # same as before (it may contain empty entries).
        processed_email.append(word)

        if len(word) < 1:
            continue

        word_indices.extend(positions_by_word.get(word, ()))

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))

    return word_indices
def processEmail(email_contents, verbose=True):
    """
    Preprocesses the body of an email and returns a list of indices of the
    words contained in the email.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the resulting email after processing.

    Returns
    -------
    word_indices : list
        A list of integers containing the index of each word in the email
        which is also present in the vocabulary.

    Notes
    -----
    - Each stemmed word of the email is looked up in the vocabulary list
      (vocabList). If a match exists, the index of the word is added to
      word_indices. For example, if vocabList[18] = 'action', then 18 is
      appended for the word 'action'.
    - vocabList[idx] returns the word with index idx in the vocabulary list.
    - When a word appears more than once in the vocabulary list, the index of
      its first occurrence is used (the same result vocabList.index(word)
      gives).
    """
    # Load Vocabulary
    vocabList = utils.getVocabList()

    # Word -> index of first occurrence, built once so each email word is a
    # constant-time lookup instead of a scan over the whole vocabulary.
    index_of = {}
    for idx, vocab_word in enumerate(vocabList):
        index_of.setdefault(vocab_word, idx)

    # Init return value
    word_indices = []

    # ========================== Preprocess Email ===========================
    # When working with raw emails that still carry their full headers, the
    # headers can be removed first by cutting everything before the first
    # blank line (two consecutive newline characters).

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and does
    # not have any < or > in the tag, and replaces it with a space
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub(r'[0-9]+', ' number ', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]*', ' httpaddr ',
                            email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', email_contents)

    # Handle $ sign
    email_contents = re.sub(r'[$]+', ' dollar ', email_contents)

    # Get rid of any punctuation.  Note that ".-:" inside the character class
    # is a range ('.', '/', digits, ':'); it is kept unchanged so the
    # tokenization stays the same.
    email_contents = re.split(r'[ @$/#.-:&*+=\[\]?!(){},">_<;%\n\r]',
                              email_contents)

    # Remove any empty word string
    email_contents = [word for word in email_contents if len(word) > 0]

    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    processed_email = []
    for word in email_contents:
        # Remove any remaining non alphanumeric characters in word
        word = non_alphanumeric.sub('', word).strip()
        word = stemmer.stem(word)
        processed_email.append(word)

        if len(word) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if found
        if word in index_of:
            word_indices.append(index_of[word])

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))

    return word_indices
def processEmail(email_contents, verbose=True):
    """
    Preprocess the body of an email and return the vocabulary indices of the
    words it contains.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the resulting email after processing.

    Returns
    -------
    word_indices : list
        The 0-based index (first occurrence in the list returned by
        ``utils.getVocabList()``) of every processed word of the email that
        is present in the vocabulary, in email order.
    """
    # Load Vocabulary
    vocabList = utils.getVocabList()

    # A word -> first-index table replaces the per-word list.index call,
    # which scanned the vocabulary and raised ValueError for every word that
    # was not in it.
    first_index = {}
    for idx, entry in enumerate(vocabList):
        first_index.setdefault(entry, idx)

    # ========================== Preprocess Email ===========================
    # For raw emails that still include their full headers, strip everything
    # before the first blank line (two consecutive newlines) beforehand.

    # Lower case
    text = email_contents.lower()

    # Strip all HTML: anything that starts with <, ends with > and has no
    # < or > in between is replaced with a space
    text = re.sub(r'<[^<>]+>', ' ', text)

    # Handle Numbers: one or more characters between 0-9
    text = re.sub(r'[0-9]+', ' number ', text)

    # Handle URLS: strings starting with http:// or https://
    text = re.sub(r'(http|https)://[^\s]*', ' httpaddr ', text)

    # Handle Email Addresses: strings with @ in the middle
    text = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', text)

    # Handle $ sign
    text = re.sub(r'[$]+', ' dollar ', text)

    # Get rid of any punctuation and drop empty word strings.  The ".-:"
    # inside the class is a range ('.', '/', digits, ':') and is left as-is
    # to keep the tokenization identical.
    pieces = re.split(r'[ @$/#.-:&*+=\[\]?!(){},">_<;%\n\r]', text)
    pieces = [piece for piece in pieces if len(piece) > 0]

    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')

    word_indices = []
    processed_email = []
    for piece in pieces:
        # Remove any remaining non alphanumeric characters, then stem
        word = stemmer.stem(non_alphanumeric.sub('', piece).strip())
        processed_email.append(word)

        if len(word) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if found
        index = first_index.get(word)
        if index is not None:
            word_indices.append(index)

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))

    return word_indices
def processEmail(email_contents, verbose=True):
    """
    Preprocess the body of an email and return the distinct vocabulary
    indices of the words it contains.

    Parameters
    ----------
    email_contents : str
        A string containing one email.

    verbose : bool
        If True, print the resulting email after processing.

    Returns
    -------
    word_indices : list
        The 0-based indices (first occurrence in the list returned by
        ``utils.getVocabList()``) of the processed words of the email that
        are present in the vocabulary. Each index appears at most once; the
        list is produced from a set, so its order is not the email order.
    """
    # Load vocabulary
    vocabList = utils.getVocabList()

    # Word -> first index, built once; avoids a list.index scan (and a
    # ValueError for every unknown word) per word of the email.
    lookup = {}
    for idx, vocab_word in enumerate(vocabList):
        lookup.setdefault(vocab_word, idx)

    # Init return value; a set so repeated words contribute a single index
    word_indices = set()

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML
    # Looks for any expression that starts with < and ends with >
    # and does not have any < or > in the tag, and replaces it with a space
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)

    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.sub(r'[0-9]+', ' number ', email_contents)

    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.sub(r'(http|https)://[^\s]*', ' httpaddr ',
                            email_contents)

    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.sub(r'[^\s]+@[^\s]+', ' emailaddr ', email_contents)

    # Handle $ sign
    email_contents = re.sub(r'[$]+', ' dollar ', email_contents)

    # Get rid of any punctuation.  ".-:" inside the class is a range
    # ('.', '/', digits, ':'); kept unchanged to preserve tokenization.
    email_contents = re.split(r'[ @$/#.-:&*+=\[\]?!(){},">_<;%\n\r]',
                              email_contents)

    # Remove any empty word string
    email_contents = [word for word in email_contents if len(word) > 0]

    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]')
    processed_email = []
    for word in email_contents:
        # Remove any remaining non alphanumeric characters in word
        word = non_alphanumeric.sub('', word).strip()

        # Reduce words to their core
        word = stemmer.stem(word)
        processed_email.append(word)

        if len(word) < 1:
            continue

        if word in lookup:
            word_indices.add(lookup[word])

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))

    return list(word_indices)