import re

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def tokenize(text):
    '''Clean and tokenize input messages.'''
    # Replace URLs with a placeholder so each survives as a single token
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    for url in re.findall(url_regex, text):
        text = text.replace(url, "urlplaceholder")

    # Tokenize text
    tokens = word_tokenize(text)

    # Build the helpers once, not once per token
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        # Lowercase first so stop-word matching works ("The" -> "the")
        tok = tok.lower().strip()

        # Remove stop words
        if tok in stop_words:
            continue

        # Reduce words to their stems
        tok = stemmer.stem(tok)

        # Reduce words to their root form
        tok = lemmatizer.lemmatize(tok)

        clean_tokens.append(tok)

    # Remove all non-alphabetic tokens (punctuation, numbers)
    clean_tokens = [tok for tok in clean_tokens if tok.isalpha()]

    # Return clean and tokenized text
    return clean_tokens
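# A quick usage sketch for the tokenizer above. The sample message and the
# one-time nltk.download() calls are illustrative assumptions about the setup,
# not part of the original pipeline.
import nltk

nltk.download('punkt')       # tokenizer models for word_tokenize
nltk.download('stopwords')   # English stop-word list
nltk.download('wordnet')     # lemmatizer dictionary

message = "Water needed urgently, see https://example.org/help for details"
print(tokenize(message))
# Roughly: ['water', 'need', 'urgent', 'see', 'urlplacehold', 'detail']
# ('for' is a stop word, ',' fails isalpha(); exact stems vary by NLTK version)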
def tokenize(text):
    '''Tokenize input messages (lighter variant without URL handling).'''
    tokens = word_tokenize(text)

    # Build the helpers once, not once per token
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        # Lowercase first so stop-word matching works
        tok = tok.lower().strip()

        # Remove stop words
        if tok in stop_words:
            continue

        # Reduce words to their stems
        tok = stemmer.stem(tok)

        # Reduce words to their root form
        tok = lemmatizer.lemmatize(tok)

        clean_tokens.append(tok)

    # Remove all non-alphabetic tokens
    return [tok for tok in clean_tokens if tok.isalpha()]
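# The lighter variant behaves the same as the first on plain text; it only
# differs when a message contains a URL. A minimal check on an assumed
# sample message:
print(tokenize("Help needed at the shelter now"))
# Roughly: ['help', 'need', 'shelter'] ('at', 'the', and 'now' are stop words)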