Example #1
    def preprocess(self, text, min_len=2, max_len=240, remove_common=False):
        '''
        Remove stop words and perform lemmatization/stemming.

        INPUT:
            - text (str): tweet text to preprocess.
            - min_len (int): words with fewer than min_len characters are removed.
            - max_len (int): words with more than max_len characters are removed.
            - remove_common (bool): also add words that are very common in the
              corpus (brand names) to the stopwords list.
        OUTPUT:
            list of cleaned (lemmatized and stemmed) tokens.
        '''

        result = []
        # Build the stopword set: gensim's English STOPWORDS plus Spanish
        # stopwords and a few Twitter-specific tokens.
        stopwords = set(STOPWORDS)
        stopwords.update(self._get_spanish_stopwords())
        stopwords.update(['http', 'f**k', 'rt'])
        if remove_common:
            # Optionally drop the corpus's dominant brand names as well.
            stopwords.update(['google', 'apple', 'twitter', 'microsoft'])

        # Tokenize and lowercase with gensim, then lemmatize/stem every token
        # that is not a stopword.
        for token in gensim.utils.simple_preprocess(text,
                                                    min_len=min_len,
                                                    max_len=max_len):
            if token not in stopwords:
                result.append(self._lemmatize_stemming(token))
        return result
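
For context, here is a standalone sketch of the same pipeline: gensim's simple_preprocess for tokenization plus a stopword filter, with STOPWORDS imported from gensim.parsing.preprocessing. The lemmatize/stem step is an assumption, since the _lemmatize_stemming helper is not shown in the snippet; a WordNet lemmatizer followed by a Snowball stemmer is used here as a stand-in.

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer

# Requires the WordNet data: nltk.download('wordnet')
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(token):
    # Stand-in for the snippet's _lemmatize_stemming helper:
    # lemmatize as a verb, then stem.
    return stemmer.stem(lemmatizer.lemmatize(token, pos='v'))

def preprocess(text, min_len=2, max_len=240):
    stopwords = set(STOPWORDS) | {'http', 'rt'}
    return [lemmatize_stemming(tok)
            for tok in gensim.utils.simple_preprocess(text, min_len=min_len, max_len=max_len)
            if tok not in stopwords]

# Prints the stemmed, stopword-free tokens of a sample tweet.
print(preprocess("RT loving the new Apple keynote http://t.co/xyz"))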
Example #2
    def _preprocess(self,
                    text,
                    min_len=2,
                    max_len=240,
                    custom_stopwords=False):
        '''Tokenize with gensim, remove stopwords and lemmatize/stem.

        When custom_stopwords is True, the stopword list is extended with
        Spanish and project-specific terms.'''
        result = []
        if custom_stopwords:
            # Extend gensim's English STOPWORDS with Spanish and
            # project-specific stopwords.
            stopwords = set(STOPWORDS)
            stopwords.update(self._get_spanish_stopwords())
            # stopwords.update(['http', 'f**k', 'rt'])
            stopwords.update(self._get_custom_stopwords())
        else:
            # Default: gensim's built-in English stopword list only.
            stopwords = STOPWORDS.copy()

        for token in gensim.utils.simple_preprocess(text,
                                                    min_len=min_len,
                                                    max_len=max_len):
            if token not in stopwords:
                result.append(self._lemmatize_stemming(token))
        return result
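
The only difference from Example #1 is how the stopword set is built. A minimal sketch of the two branches, with placeholder lists standing in for the _get_spanish_stopwords() and _get_custom_stopwords() helpers (neither is shown in the snippet):

from gensim.parsing.preprocessing import STOPWORDS

# custom_stopwords=False: gensim's built-in English stopword list only.
default_stopwords = STOPWORDS.copy()

# custom_stopwords=True: built-ins extended with Spanish and project-specific terms.
extended_stopwords = set(STOPWORDS)
extended_stopwords.update(['de', 'la', 'que'])   # placeholder for _get_spanish_stopwords()
extended_stopwords.update(['http', 'rt'])        # placeholder for _get_custom_stopwords()

print(len(default_stopwords), len(extended_stopwords))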
Example #3
    def tokenize_text(self, text):
        '''Split text into sentences and words with NLTK and return the
        lowercased tokens (punctuation and single characters removed).'''
        tokens = []
        # Build the stopword set (English, Spanish and Twitter-specific terms).
        # Note: the stopword filter in the loop below is commented out, so the
        # set is not currently applied.
        stopwords = set(STOPWORDS)
        stopwords.update(self._get_spanish_stopwords())
        stopwords.update(['http', 'f**k', 'rt'])

        for sent in nltk.sent_tokenize(text):
            for word in nltk.word_tokenize(sent):
                # if word not in stopwords:
                if len(word) < 2:  # skip punctuation and single-character tokens
                    continue
                # tokens.append(self._lemmatize_stemming(word.lower()))
                tokens.append(word.lower())
        return tokens
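
A runnable sketch of the same tokenization outside the class, assuming NLTK's punkt tokenizer models are available; as in the original, the stopword set is not applied because that check is commented out.

import nltk
# nltk.download('punkt')  # one-time download of the sentence/word tokenizer models

def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:  # drop punctuation and single characters
                continue
            tokens.append(word.lower())
    return tokens

# Prints the lowercased tokens of a sample tweet.
print(tokenize_text("RT @user: Loving the new keynote! http://t.co/xyz"))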