# Module-level imports these methods rely on. STOPWORDS is assumed to be
# gensim.parsing.preprocessing.STOPWORDS; the original import block is not
# shown in this excerpt.
import gensim
import nltk
from gensim.parsing.preprocessing import STOPWORDS


def preprocess(self, text, min_len=2, max_len=240, remove_common=False):
    '''
    Remove stop words and apply lemmatization/stemming to tweet text.

    INPUT:
        - text (str): tweet text.
        - min_len (int): words with fewer characters than min_len are removed.
        - max_len (int): words with more characters than max_len are removed.
        - remove_common (bool): also add words that are very common in the
          corpus to the stopwords list.

    OUTPUT:
        list of cleaned tokens.
    '''
    result = []
    # Start from gensim's English stopwords and extend them with Spanish
    # stopwords plus a few Twitter-specific tokens.
    stopwords = set(STOPWORDS)
    spanish = self._get_spanish_stopwords()
    stopwords.update(spanish)
    stopwords.update(['http', 'f**k', 'rt'])
    if remove_common:
        stopwords.update(['google', 'apple', 'twitter', 'microsoft'])
    for token in gensim.utils.simple_preprocess(text, min_len=min_len, max_len=max_len):
        if token not in stopwords:
            result.append(self._lemmatize_stemming(token))
    return result
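# Illustrative call pattern (the owning class and its instance name are not
# shown in this excerpt, so `tp` below is an assumption):
#
#   tp.preprocess("RT @user check out the new Apple event http://t.co/xyz",
#                 remove_common=True)
#   # -> list of stemmed/lemmatized tokens with stopwords, 'rt', 'http' and
#   #    the common brand names removed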
def _preprocess(self, text, min_len=2, max_len=240, custom_stopwords=False):
    '''
    Internal variant of preprocess(). When custom_stopwords is True, the
    Spanish and corpus-specific stopword lists are added on top of gensim's
    STOPWORDS; otherwise only the default STOPWORDS are used.
    '''
    result = []
    if custom_stopwords:
        stopwords = set(STOPWORDS)
        spanish = self._get_spanish_stopwords()
        custom = self._get_custom_stopwords()
        stopwords.update(spanish)
        # stopwords.update(['http', 'f**k', 'rt'])
        stopwords.update(custom)
    else:
        stopwords = STOPWORDS
    for token in gensim.utils.simple_preprocess(text, min_len=min_len, max_len=max_len):
        if token not in stopwords:
            result.append(self._lemmatize_stemming(token))
    return result
def tokenize_text(self, text):
    '''
    Tokenize text into lowercase word tokens, dropping tokens shorter than
    two characters. Stopword removal and lemmatization are prepared here but
    currently disabled (see the commented-out lines below).
    '''
    tokens = []
    # Build the extended stopword list (only used by the disabled filter below).
    stopwords = set(STOPWORDS)
    spanish = self._get_spanish_stopwords()
    stopwords.update(spanish)
    stopwords.update(['http', 'f**k', 'rt'])
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            # if word not in stopwords:
            if len(word) < 2:
                continue
            # tokens.append(self._lemmatize_stemming(word.lower()))
            tokens.append(word.lower())
    return tokens
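# ---------------------------------------------------------------------------
# Minimal standalone sketch of the same cleaning pipeline, for reference only.
# It assumes the unseen helpers wrap NLTK's Spanish stopword list and a
# WordNet lemmatizer followed by a Snowball stemmer; those are assumptions,
# not confirmed by this file. Requires the 'stopwords' and 'wordnet' NLTK
# corpora (nltk.download('stopwords'); nltk.download('wordnet')).
# ---------------------------------------------------------------------------
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer


def demo_preprocess(text, min_len=2, max_len=240):
    '''Stopword removal + lemmatization/stemming without the class helpers.'''
    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    stop = set(STOPWORDS)
    stop.update(nltk_stopwords.words('spanish'))  # assumed Spanish stopword source
    stop.update(['http', 'f**k', 'rt'])
    return [stemmer.stem(lemmatizer.lemmatize(token, pos='v'))
            for token in gensim.utils.simple_preprocess(text, min_len=min_len, max_len=max_len)
            if token not in stop]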