def preprocess(doc, max_word_length=None, min_word_length=None, stopwords="short", stem=False):
    stop = pre.stopwords if stopwords == "short" else pre.stopwords_long
    # Clean
    text = pre.clean(doc.pure_text)
    # Remove stopwords
    text = pre.filter_by_list(text, stop)
    # Remove words outside the given length bounds; each bound applies
    # independently, and pre's defaults are used for any bound not supplied
    length_kwargs = {}
    if max_word_length is not None:
        length_kwargs["max_length"] = max_word_length
    if min_word_length is not None:
        length_kwargs["min_length"] = min_word_length
    text = pre.filter_by_length(text, **length_kwargs)
    # Stem
    if stem:
        text = pre.stem(text)
    # Now replace the document's text with the processed text
    doc.get_words(text)
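# Usage sketch. Assumptions: `make_document` is a hypothetical constructor
# standing in for however `doc` objects (with `pure_text` and `get_words`)
# are created in this codebase; the parameter values are illustrative.
#
#   doc = make_document("raw text of the document ...")  # hypothetical
#   preprocess(doc, max_word_length=20, min_word_length=3,
#              stopwords="long", stem=True)
#   # doc now holds the cleaned, filtered, and stemmed word list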
import numpy as np
from nltk import FreqDist


def preprocess(tweet):
    # Clean, tokenize, and strip stopwords (the helper functions and the
    # `stopwords` / `features` globals are defined elsewhere)
    tokens = remove_stopwords(tokenize(clean(tweet)), stopwords)
    # Count token frequencies and project them onto the fixed feature list,
    # producing a single-row matrix a classifier can consume directly
    fdist = FreqDist(tokens)
    return np.array([[fdist[f] for f in features]])
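# Usage sketch. Assumption: `clf` is a fitted scikit-learn-style classifier
# trained on rows with one column per entry in `features`.
#
#   x = preprocess("just landed in SF, the views were incredible!")
#   x.shape         # (1, len(features)): one row of per-feature counts
#   clf.predict(x)  # hypothetical fitted classifier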
def clean_up(text):
    # Normalize the raw text, drop long-list stopwords, and strip any
    # non-ASCII characters
    text = pre.clean(text)
    text = pre.filter_by_list(text, pre.stopwords_long)
    text = pre.remove_non_ascii(text)
    return text
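# Usage sketch. The call below is illustrative; the exact output depends on
# pre.clean and the long stopword list.
#
#   clean_up("The café's <b>menu</b> was updated!")
#   # returns the text cleaned, with long-list stopwords and any
#   # non-ASCII characters removed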