def process_email(email_contents):
    """Pre-process an email body and return indices of its words in the vocabulary.

    :param email_contents: raw email body text
    :return: list of vocabulary indices for the words found in the email
    """
    # NOTE(review): helper name looks misspelled ("vacabulary") -- confirm it
    # matches the definition elsewhere in the project before renaming it.
    voca_list = get_vacabulary_list('./vocab.txt')
    word_indices = []

    # Normalize: lower-case, strip HTML tags, blank out digit runs, and
    # canonicalize URLs / e-mail addresses / dollar signs into fixed tokens.
    email_contents = email_contents.lower()
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    email_contents = re.sub(r'[0-9]+', ' ', email_contents)
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)

    print('\n==== Processed Email ====\n\n')

    # Python and MATLAB regex syntax differ; some symbols need escaping here.
    email_contents = re.split(
        r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s\n\r\t]+', email_contents)

    stemmer = PorterStemmer()  # hoisted: one stemmer instance for all tokens
    for vocabulary in email_contents:
        vocabulary = re.sub(r'[^a-zA-Z0-9]', '', vocabulary)
        vocabulary = stemmer.stem(vocabulary.strip())
        if len(vocabulary) <= 1:
            continue
        # Record the vocabulary index of each known word; unknown words are
        # skipped (the original `else: index = 0` branch was dead code).
        if vocabulary in voca_list:
            word_indices.append(voca_list[vocabulary])

    print('\n\n=========================\n')
    return word_indices
def train_arrays(headlines, tag):
    """Build a training set (feature vectors plus binary labels) from headlines.

    :param headlines: DOM nodes, each with a <text> child and a <tag> child
    :param tag: name of the numeric element used to derive the class label
    :return: (all_vektors, classes, noDub) where all_vektors are bag-of-words
        vectors, classes are 0/1 labels, and noDub is the vocabulary list
    """
    # Headlines at or below the median tag value get class 0, the rest class 1.
    threshold = analyse.find_median(analyse.make_list_num(headlines, tag))
    full_text = []
    classes = []
    for headline in headlines:
        # NOTE(review): .encode('utf-8') yields bytes -- this looks like
        # Python 2 code; under Python 3 the str.replace below would fail.
        text = headline.getElementsByTagName(
            "text")[0].childNodes[0].data.encode('utf-8')
        value = int(headline.getElementsByTagName(tag)[0].childNodes[0].data)
        if value <= threshold:
            classes.append(0)
        else:
            classes.append(1)
        # get rid of all punctuation
        for char in string.punctuation:
            text = text.replace(char, '')
        # Stem and normalize every word of the headline text.
        for w in text.split():
            w = PorterStemmer().stem_word(w)
            full_text.append(w.strip().lower())
    # De-duplicate the accumulated words to obtain the vocabulary.
    noDub = make_no_dub_list(full_text)
    #noDub=make_no_dub_list(full_text, lambda x: x.lower())
    all_vektors = make_array(noDub, headlines)
    #print len(all_vektors), " , " , len(classes)
    return all_vektors, classes, noDub
def make_array(noDub, headlines):
    """Build a binary bag-of-words vector for each headline.

    :param noDub: de-duplicated vocabulary (stemmed, lower-cased words)
    :param headlines: DOM nodes, each containing a <text> child element
    :return: list of 0/1 vectors, one per headline, aligned with noDub
    """
    # Hoist loop-invariant work: one stemmer instance, and an O(1)
    # word->index map instead of calling list.index() (O(n)) per word.
    stemmer = PorterStemmer()
    index_of = {}
    for i, word in enumerate(noDub):
        index_of.setdefault(word, i)  # keep first occurrence, like list.index
    all_vektors = []
    for headline in headlines:
        headline_vektor = [0] * len(noDub)
        # NOTE(review): .encode('utf-8') yields bytes -- Python 2 idiom;
        # under Python 3 the str.replace below would raise. Kept as-is to
        # preserve behavior; confirm target interpreter before changing.
        text = headline.getElementsByTagName(
            "text")[0].childNodes[0].data.encode('utf-8')
        for w in text.split():
            for char in string.punctuation:
                w = w.replace(char, '')
            w = stemmer.stem_word(w)
            w = w.strip().lower()
            if w in index_of:
                headline_vektor[index_of[w]] = 1
        all_vektors.append(headline_vektor)
    return all_vektors
def processEmail(self, email_contents):
    """Pre-process the body of an email and return a list of word indices.

    :param email_contents: raw email body text
    :return: list of indices (into self.vocabList) of words found in the email
    """
    # NOTE(review): local is unused below (the loop reads self.vocabList);
    # the call is kept in case getVocabList() populates that attribute.
    vocabList = self.getVocabList()
    word_indices = []

    # ========================= Preprocess Email =========================
    # If working with raw emails, strip the headers first by cutting
    # everything up to the first blank line (email_contents.find('\n\n')).

    email_contents = email_contents.lower()
    # Strip HTML: anything that looks like <...> becomes a space.
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    # Normalize digit runs, URLs, e-mail addresses and dollar signs.
    email_contents = re.sub(r'[0-9]+', "number", email_contents)
    email_contents = re.sub(r'(http|https)://[^\s]*', "httpaddr", email_contents)
    # BUG FIX: the original class was [^s\s], which wrongly excluded the
    # letter 's'; any non-whitespace run around '@' is an address.
    email_contents = re.sub(r'[^\s]+@[^\s]+', "emailaddr", email_contents)
    email_contents = re.sub(r'[$]+', "dollar", email_contents)

    # ========================== Tokenize Email ==========================
    print("===== Email Processed =====")
    # Split on punctuation/whitespace (order differs slightly from MATLAB).
    email_contents = re.split(
        r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s\n\r\t]+', email_contents)

    stemmer = PorterStemmer()  # hoisted: one stemmer instance for all tokens
    for token in email_contents:
        # Remove any non-alphanumeric characters, then stem.
        token = re.sub(r"[^a-zA-Z0-9]", '', token)
        token = stemmer.stem(token.strip())
        if len(token) < 1:
            continue
        # 0 marks "not in vocabulary"; only real indices are recorded.
        indx = self.vocabList[token] if token in self.vocabList else 0
        if indx > 0:
            word_indices.append(indx)

    print("\n\n================================\n")
    return word_indices
def process_email(email_contents: str) -> List[int]:
    """Pre-process the body of an email and return a list of indices of the
    words contained in the email.

    :param email_contents: the body of an email
    :return: a list of indices of the words contained in the email
    """
    # Load the vocabulary and invert it once, so each token lookup is O(1)
    # instead of scanning the whole dictionary for every token.
    vocabulary_dict = get_vocabulary_dict()
    index_by_word = {word: i for i, word in vocabulary_dict.items()}

    word_indices = []

    # ========================== Preprocess Email ===========================
    # If working with raw emails, strip the full headers first by cutting
    # everything up to the first '\n\n'.

    email_contents = email_contents.lower()
    # Strip all HTML: any expression between matching < and > becomes a space.
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    # Convert all sequences of digits to a 'number' token.
    email_contents = re.sub(r'[0-9]+', 'number', email_contents)
    # Convert URLs starting with http:// or https:// to a 'httpaddr' token.
    email_contents = re.sub(r'(http://|https://)+\S*', 'httpaddr', email_contents)
    # Convert strings with @ in the middle to a 'emailaddr' token.
    email_contents = re.sub(r'[\S*]+(@)+\S*', 'emailaddr', email_contents)
    # BUG FIX: was '[$]' (no '+'), which renamed each '$' separately so
    # '$$$' became 'dollardollardollar'; a run now collapses to one token.
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email =============================
    print('\n==== Processed Email ====\n\n')
    col = 0  # current output column, for 78-character line wrapping

    tokens = re.split(r'[ @$/#.-:&*\+=\[\]?!\(\)\{\},">_<;#\n\r]',
                      email_contents)

    stemmer = PorterStemmer()  # hoisted: one stemmer instance for all tokens
    for token in tokens:
        # Remove any non-alphanumeric characters, then stem.
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        token = stemmer.stem(token.strip())
        if len(token) < 1:
            continue

        # Record the index of the word if it is in the vocabulary.
        index = index_by_word.get(token)
        if index is not None:
            word_indices.append(index)

        # Print to screen, ensuring that output lines are not too long.
        if (col + len(token) + 1) > 78:
            print('')
            col = 0
        print('{} '.format(token), end='', flush=True)
        col = col + len(token) + 1

    print('\n\n=========================\n')
    return word_indices
def processEmail(email_contents):
    """PROCESSEMAIL pre-processes the body of an email and returns a list of
    indices of the words contained in the email.

    :param email_contents: raw email body text
    :return: word_indices, a list of indices into the vocabulary
    """
    # Load Vocabulary
    vocabList = gvl.getVocabList()
    word_indices = []

    # ========================== Preprocess Email ===========================
    # Uncomment to strip full headers from raw emails:
    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]

    email_contents = email_contents.lower()
    # Strip HTML tags (anything between matching < and >).
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    # Normalize numbers, URLs, e-mail addresses and dollar signs.
    email_contents = re.sub(r'[0-9]+', 'number', email_contents)
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email =============================
    print('\n==== Processed Email ====\n\n')
    l = 0  # current output column, for 78-character line wrapping

    # Split on punctuation/whitespace (slightly different order from MATLAB).
    email_contents = re.split(
        r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s\n\r\t]+', email_contents)

    stemmer = PorterStemmer()  # hoisted: one stemmer instance for all tokens
    for token in email_contents:
        # Remove any non-alphanumeric characters, then stem.
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        token = stemmer.stem_word(token.strip())
        if len(token) < 1:
            continue

        # Look the stemmed word up; 0 means "not in vocabulary".
        idx = vocabList[token] if token in vocabList else 0
        if idx > 0:
            word_indices.append(idx)

        # Print to screen, ensuring that output lines are not too long.
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        # BUG FIX: the Python-2 trailing-comma idiom `print(...),` printed a
        # newline per token under Python 3, defeating the column wrapping.
        print('{:s}'.format(token), end=' ')
        l = l + len(token) + 1

    print('\n\n=========================\n')
    return word_indices
def process_email(email_contents: str) -> List[int]:
    """Pre-process the body of an email and return a list of indices of the
    words contained in the email.

    :param email_contents: the body of an email
    :return: a list of indices of the words contained in the email
    """
    # Load the vocabulary and invert the {index: word} mapping once,
    # so each token lookup is O(1).
    vocabulary_dict = get_vocabulary_dict()
    index_by_word = {word: i for i, word in vocabulary_dict.items()}

    word_indices = []

    # ========================== Preprocess Email ===========================
    # If working with raw emails, strip the full headers first by cutting
    # everything up to the first '\n\n'.

    email_contents = email_contents.lower()
    # Strip all HTML: any expression between matching < and > becomes a space.
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    # Convert all sequences of digits to a 'number' token.
    email_contents = re.sub(r'\d+', 'number', email_contents)
    # Convert URLs starting with http:// or https:// to a 'httpaddr' token.
    email_contents = re.sub(r'(http://|https://)[\w\.-]+', 'httpaddr',
                            email_contents)
    # Convert strings with @ in the middle to a 'emailaddr' token.
    email_contents = re.sub(r'[\w\.-]+@[\w\.-]+', 'emailaddr', email_contents)
    # Convert all sequences of $ signs to a 'dollar' token.
    email_contents = re.sub(r'\$+', 'dollar', email_contents)

    # ========================== Tokenize Email =============================
    print('\n==== Processed Email ====\n\n')
    col = 0  # current output column, for 78-character line wrapping

    tokens = re.split(r'[ @$/#.-:&*\+=\[\]?!\(\)\{\},">_<;#\n\r]',
                      email_contents)

    stemmer = PorterStemmer()  # hoisted: one stemmer instance for all tokens
    for token in tokens:
        # Remove any non-alphanumeric characters, then stem.
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        token = stemmer.stem(token.strip())
        if len(token) < 1:
            continue

        # BUG FIX: out-of-vocabulary words used to `raise ValueError`,
        # aborting the whole email; they are simply skipped now.
        index = index_by_word.get(token)
        if index is not None:
            word_indices.append(index)

        # Print to screen, ensuring that output lines are not too long.
        if (col + len(token) + 1) > 78:
            print('')
            col = 0
        print('{} '.format(token), end='', flush=True)
        # BUG FIX: the column counter used len(tokens) -- the length of the
        # whole token *list* -- instead of len(token), breaking the wrap.
        col = col + len(token) + 1

    print('\n\n=========================\n')
    return word_indices
def processEmail(email_contents):
    """Pre-process the body of an email and return a list of indices of the
    words contained in the email.

    :param email_contents: raw email body text
    :return: word_indices, a list of indices into the vocabulary
    """
    # Load Vocabulary
    vocabList = getVocabList()
    word_indices = []

    # ========================== Preprocess Email ===========================
    # Uncomment to strip full headers from raw emails:
    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]

    email_contents = email_contents.lower()
    # Strip HTML tags (anything between matching < and >).
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)
    # Normalize numbers, URLs, e-mail addresses and dollar signs.
    email_contents = re.sub(r'[0-9]+', 'number', email_contents)
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email =============================
    print('\n==== Processed Email ====\n\n')
    l = 0  # current output column, for 78-character line wrapping

    # Split on punctuation/whitespace (slightly different order from MATLAB).
    email_contents = re.split(
        r'[@$/#.-:&\*\+=\[\]?!(){},\'\'\">_<;%\s\n\r\t]+', email_contents)

    stemmer = PorterStemmer()  # hoisted: one stemmer instance for all tokens
    for token in email_contents:
        # Remove any non-alphanumeric characters, then stem.
        token = re.sub(r'[^a-zA-Z0-9]', '', token)
        token = stemmer.stem(token.strip())
        if len(token) < 1:
            continue

        # Look the stemmed word up; 0 means "not in vocabulary",
        # and only real (non-zero) indices are recorded.
        idx = vocabList[token] if token in vocabList else 0
        if idx > 0:
            word_indices.append(idx)

        # Print to screen, ensuring that output lines are not too long.
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        # BUG FIX: the Python-2 trailing-comma idiom `print(...),` printed a
        # newline per token under Python 3, defeating the column wrapping.
        print('{:s}'.format(token), end=' ')
        l = l + len(token) + 1

    print('\n\n=========================\n')
    return word_indices