def text_cleaner(text, deep_clean=False, stem=True, stop_words=True, translite_rate=True):
    """Normalize a raw (possibly HTML) text string and return it lower-cased.

    Parameters
    ----------
    text : str
        Input text, possibly containing HTML markup.
    deep_clean : bool
        When True, also strip punctuation and run the NLP pipeline
        (transliteration, stemming/lemmatization, stop-word removal).
    stem : bool
        Apply PorterStemmer (deep_clean path only).
    stop_words : bool
        Remove English stop words (deep_clean path only).
    translite_rate : bool
        Transliterate the text (deep_clean path only).

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Ordered HTML/whitespace cleanup rules; order matters (e.g. <a href>
    # links are rewritten to their URL before remaining tags are stripped).
    rules = [
        {r'>\s+': u'>'},                 # remove spaces after a tag opens or closes
        {r'\s+': u' '},                  # collapse consecutive whitespace
        {r'\s*<br\s*/?>\s*': u'\n'},     # <br> becomes a newline
        {r'</(div)\s*>\s*': u'\n'},      # newline after </div>
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # blank line after </p> and headings
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},     # drop <head>...</head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # keep link URL, drop anchor text
        {r'[ \t]*<[^<]*?/?>': u''},      # remove any remaining tags
        {r'^\s+': u''},                  # strip leading whitespace
    ]

    def _apply_rules(s):
        # Apply every regex rule in order, then trim surrounding whitespace.
        # (Previously duplicated in both branches; rstrip()+strip() == strip().)
        for rule in rules:
            for (pattern, replacement) in rule.items():
                s = re.compile(pattern).sub(replacement, s)
        return s.strip()

    if deep_clean:
        # One translate() pass replaces the former chain of 11 str.replace
        # calls: '.' and '"' are deleted, the rest become spaces.
        text = text.translate(str.maketrans({
            '.': '', '"': '',
            '[': ' ', ',': ' ', ']': ' ', '(': ' ', ')': ' ',
            '-': ' ', '=': ' ', '?': ' ', '!': ' ',
        }))
        text = _apply_rules(text)
        text = text.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
        # Drop standalone numbers.  Raw string: \W and \d were previously
        # written in a plain string literal (deprecated invalid escapes).
        text = re.sub(r"(^|\W)\d+($|\W)", " ", text)
        if translite_rate:
            text = transliterate(text)
        if stem:
            # NOTE(review): PorterStemmer.stem on a whole sentence only stems
            # the final token; kept as-is for compatibility — confirm intent.
            text = PorterStemmer().stem(text)
        text = WordNetLemmatizer().lemmatize(text)
        if stop_words:
            # Distinct name: the original rebound the boolean `stop_words`
            # parameter to a set here, shadowing it.
            stop_set = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            text = ' '.join(w for w in word_tokens if w not in stop_set)
    else:
        text = _apply_rules(text)

    return text.lower()
def processEmail(email_contents):
    """Preprocess an e-mail body and map its words to vocabulary indices.

    Lower-cases the text, strips HTML, canonicalizes numbers ('number'),
    URLs ('httpaddr'), e-mail addresses ('emailaddr') and dollar signs
    ('dollar'), tokenizes on punctuation/whitespace, stems each token, and
    collects the vocabulary index of every token found in getVocabList().
    Processed tokens are echoed to stdout, wrapped at ~78 columns.

    Parameters
    ----------
    email_contents : str
        Raw e-mail body text.

    Returns
    -------
    list[int]
        Indices (into getVocabList()) of the recognized, stemmed tokens.
    """
    vocabList = getVocabList()
    word_indices = []

    # ========================== Preprocess Email ===========================
    # Find the headers (\n\n and remove).  Uncomment the following lines if
    # you are working with raw e-mails that include the full headers:
    # hdrstart = email_contents.find("\n\n")
    # if hdrstart:
    #     email_contents = email_contents[hdrstart:]

    # Lower case
    email_contents = email_contents.lower()

    # Strip all HTML: any expression that starts with < and ends with > and
    # does not contain < or > inside is replaced with a space.
    email_contents = re.sub(r'<[^<>]+>', ' ', email_contents)

    # Handle numbers: one or more characters between 0-9.
    email_contents = re.sub(r'[0-9]+', 'number', email_contents)

    # Handle URLs: strings starting with http:// or https://.
    email_contents = re.sub(r'(http|https)://[^\s]*', 'httpaddr', email_contents)

    # Handle e-mail addresses: strings with @ in the middle.
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign.
    email_contents = re.sub(r'[$]+', 'dollar', email_contents)

    # Tokenize and get rid of any punctuation.  The '-' is escaped: the
    # original unescaped '.-:' formed a character range ('.' through ':')
    # that accidentally matched '/', ';'-adjacent chars and digits 0-9.
    email_contents = re.split(r'[@$/#.\-:&*+=\[\]?!(){},\'">_<;%\s]+',
                              email_contents)

    # Process tokens, echoing them to screen with ~78-column line wrapping.
    l = 0  # running width of the current printed line
    for token in email_contents:
        # Remove any remaining non-alphanumeric characters.
        token = re.sub(r'[^a-zA-Z0-9]', '', token)

        # Stem the word.
        token = PorterStemmer().stem(token.strip())

        # Skip the word if it is too short.
        if len(token) < 1:
            continue

        # get() returns 0 ("not in vocabulary") for unknown tokens; only
        # real indices (> 0) are recorded.
        idx = vocabList.get(token, 0)
        if idx > 0:
            word_indices.append(idx)

        # Wrap the echoed output at ~78 columns.  end=' ' keeps tokens on one
        # line: the former bare print(token) emitted a newline per token,
        # which made the `l` width-tracking logic a no-op.
        if l + len(token) + 1 > 78:
            print("")
            l = 0
        print(token, end=' ')
        l = l + len(token) + 1

    return word_indices