Example #1
0
    def clean_tweet(self, text):
        """Normalize a raw tweet into plain, whitespace-separated words.

        The text goes through a fixed sequence of steps: unicode repair, HTML
        stripping, contraction expansion, substitution of URLs, emails, phone
        numbers, numbers and currency symbols, accent removal, smiley/emoji
        conversion, splitting of attached and underscore-joined words,
        punctuation and digit removal, removal of 1-2 character words, and
        whitespace normalization.

        Args:
            text: The raw tweet text; it is parsed as HTML, so it may contain
                markup.

        Returns:
            The cleaned text as a single string.
        """
        # FIX BROKEN UNICODE
        text = preprocess.fix_bad_unicode(text)

        # GET TEXT ONLY FROM HTML
        text = BeautifulSoup(text, features='lxml').getText()

        # UN-PACK CONTRACTIONS
        text = preprocess.unpack_contractions(text)

        # REPLACE URLS, EMAILS, PHONE NUMBERS, NUMBERS AND CURRENCY SYMBOLS
        # NOTE(review): every replace_* helper is called with its default
        # replacement. If that default is a placeholder token rather than an
        # empty string, the placeholder's letters survive the later steps --
        # confirm against the library version in use.
        text = preprocess.replace_urls(text)
        text = preprocess.replace_emails(text)
        text = preprocess.replace_phone_numbers(text)
        text = preprocess.replace_numbers(text)
        text = preprocess.replace_currency_symbols(text)

        # REMOVE ACCENTS
        text = preprocess.remove_accents(text)

        # CONVERT SMILEYS AND EMOJIS TO TEXT
        # Only whitespace-delimited tokens that exactly match a key of
        # self.SMILEY are substituted.
        words = text.split()
        reformed = [
            self.SMILEY[word] if word in self.SMILEY else word
            for word in words
        ]
        text = " ".join(reformed)
        text = emoji.demojize(text)
        # Every ':' in the text becomes a space (presumably to strip the
        # delimiters demojize puts around emoji names -- confirm).
        text = text.replace(":", " ")
        text = ' '.join(text.split())

        # SPLIT ATTACHED WORDS
        # Break the text in front of every uppercase letter. The second
        # alternative keeps the run of non-uppercase text that precedes the
        # first capital; without it that prefix was dropped and an
        # all-lowercase tweet was reduced to an empty string.
        # NOTE: consecutive capitals (e.g. acronyms) are still split into
        # single letters, which the short-word filter below then removes.
        text = ' '.join(re.findall(r'[A-Z][^A-Z]*|[^A-Z]+', text))

        # SPLIT UNDERSCORE WORDS
        text = text.replace('_', ' ')

        # REMOVE PUNCTUATION
        text = preprocess.remove_punct(text)

        # REMOVE ANY REMAINING DIGITS
        text = re.sub(r'\d', '', text)

        # REMOVE WORDS LESS THAN 3 CHARACTERS
        text = re.sub(r'\b\w{1,2}\b', '', text)

        # NORMALIZE WHITESPACE
        text = preprocess.normalize_whitespace(text)

        return text
Example #2
0
def test_unpack_contractions():
    """Check that unpack_contractions expands the contractions in a sample sentence."""
    raw = "Y'all can't believe you're not who they've said I'll become, but shouldn't."
    expected = "You all can not believe you are not who they have said I will become, but should not."
    actual = preprocess.unpack_contractions(raw)
    assert actual == expected
 def test_unpack_contractions(self):
     """Check that unpack_contractions expands the contractions in a sample sentence."""
     raw = "Y'all can't believe you're not who they've said I'll become, but shouldn't."
     expected = "You all can not believe you are not who they have said I will become, but should not."
     actual = preprocess.unpack_contractions(raw)
     self.assertEqual(actual, expected)