Ejemplo n.º 1
0
 def test_replace_currency_symbols(self):
     """Check ``replace_currency_symbols`` with and without a custom marker.

     With ``replace_with=None`` the expected output carries currency codes
     (USD/GBP/EUR) in place of the symbols; with ``replace_with='*CUR* '``
     every symbol is expected to become that marker.
     """
     source = '$1.00 equals £0.67 equals €0.91.'
     # Pairs of (replace_with argument, expected result), checked in order.
     expectations = (
         (None, 'USD1.00 equals GBP0.67 equals EUR0.91.'),
         ('*CUR* ', '*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.'),
     )
     for marker, expected in expectations:
         actual = preprocess.replace_currency_symbols(
             source, replace_with=marker)
         self.assertEqual(actual, expected)
Ejemplo n.º 2
0
 def test_replace_currency_symbols(self):
     """Run ``replace_currency_symbols`` against several sample sentences.

     Each case lists the raw text, the output expected when
     ``replace_with=None`` (currency codes), and the output expected when
     ``replace_with='*CUR* '`` (custom marker).
     """
     cases = (
         {
             'raw': '$1.00 equals £0.67 equals €0.91.',
             'coded': 'USD1.00 equals GBP0.67 equals EUR0.91.',
             'marked': '*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.',
         },
         {
             'raw': 'this zebra costs $100.',
             'coded': 'this zebra costs USD100.',
             'marked': 'this zebra costs *CUR* 100.',
         },
     )
     for case in cases:
         coded = preprocess.replace_currency_symbols(
             case['raw'], replace_with=None)
         self.assertEqual(coded, case['coded'])
         marked = preprocess.replace_currency_symbols(
             case['raw'], replace_with='*CUR* ')
         self.assertEqual(marked, case['marked'])
def preprocess_unicode(raw_text):
    """Lower-case and transliterate ``raw_text``, then blank out noisy tokens.

    After lower-casing and ``preprocess.transliterate_unicode``, the text is
    passed in turn through the URL, email, phone-number, number and
    currency-symbol replacers, each called with an empty replacement string.

    Returns the text produced by the last replacer.
    """
    cleaned = preprocess.transliterate_unicode(raw_text.lower())
    # Applied in this exact order; each step receives the previous output.
    replacers = (
        preprocess.replace_urls,
        preprocess.replace_emails,
        preprocess.replace_phone_numbers,
        preprocess.replace_numbers,
        preprocess.replace_currency_symbols,
    )
    for replacer in replacers:
        cleaned = replacer(cleaned, replace_with=u'')
    return cleaned
Ejemplo n.º 4
0
def test_replace_currency_symbols():
    """Verify ``replace_currency_symbols`` on a couple of sample sentences.

    For each input, the first expected string is the result with
    ``replace_with=None`` (currency codes) and the second is the result
    with ``replace_with="*CUR* "`` (custom marker).
    """
    marker = "*CUR* "
    # input text -> (expected with codes, expected with marker)
    expectations = {
        "$1.00 equals £0.67 equals €0.91.": (
            "USD1.00 equals GBP0.67 equals EUR0.91.",
            "*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.",
        ),
        "this zebra costs $100.": (
            "this zebra costs USD100.",
            "this zebra costs *CUR* 100.",
        ),
    }
    for raw, (with_codes, with_marker) in expectations.items():
        coded = preprocess.replace_currency_symbols(raw, replace_with=None)
        assert coded == with_codes
        marked = preprocess.replace_currency_symbols(raw, replace_with=marker)
        assert marked == with_marker
Ejemplo n.º 5
0
    def clean_tweet(self, text):
        """Normalize a raw tweet into plain, whitespace-separated words.

        The text is run through a fixed pipeline: unicode repair, HTML text
        extraction, contraction unpacking, replacement of URLs / emails /
        phone numbers / numbers / currency symbols (using the ``preprocess``
        defaults), accent removal, smiley and emoji conversion, splitting of
        attached (CamelCase) and underscore-joined words, punctuation and
        digit removal, removal of words shorter than three characters, and
        whitespace normalization.

        Args:
            text: The raw tweet text.

        Returns:
            The cleaned text.
        """
        # FIX BROKEN UNICODE
        text = preprocess.fix_bad_unicode(text)

        # GET TEXT ONLY FROM HTML
        text = BeautifulSoup(text, features='lxml').getText()

        # UN-PACK CONTRACTIONS
        text = preprocess.unpack_contractions(text)

        # REPLACE URLS
        text = preprocess.replace_urls(text)

        # REPLACE EMAILS
        text = preprocess.replace_emails(text)

        # REPLACE PHONE NUMBERS
        text = preprocess.replace_phone_numbers(text)

        # REPLACE NUMBERS
        text = preprocess.replace_numbers(text)

        # REPLACE CURRENCY SYMBOLS
        text = preprocess.replace_currency_symbols(text)

        # REMOVE ACCENTS
        text = preprocess.remove_accents(text)

        # CONVERT SMILEYS AND EMOJIS TO TEXT
        # Whitespace-separated tokens found in self.SMILEY are swapped for
        # their mapped value before emoji.demojize handles the rest.
        words = text.split()
        reformed = [
            self.SMILEY[word] if word in self.SMILEY else word
            for word in words
        ]
        text = " ".join(reformed)
        text = emoji.demojize(text)
        # Colons (including those introduced by demojize) become separators.
        text = text.replace(":", " ")
        text = ' '.join(text.split())

        # SPLIT ATTACHED WORDS
        # Break before every uppercase letter. The leading ``[^A-Z]+``
        # alternative keeps any text that precedes the first uppercase
        # letter; without it, such text (or a tweet with no uppercase
        # letters at all) would be silently discarded.
        text = ' '.join(re.findall(r'[^A-Z]+|[A-Z][^A-Z]*', text))

        # SPLIT UNDERSCORE WORDS
        text = text.replace('_', ' ')

        # REMOVE PUNCTUATION
        text = preprocess.remove_punct(text)

        # REMOVE REMAINING DIGITS
        text = re.sub(r'\d', '', text)

        # REMOVE WORDS LESS THAN 3 CHARACTERS
        text = re.sub(r'\b\w{1,2}\b', '', text)

        # NORMALIZE WHITESPACE
        text = preprocess.normalize_whitespace(text)

        return text
Ejemplo n.º 6
0
 def clean_text(self, raw_text):
     """Strip markup and noisy tokens from ``raw_text``.

     Steps, in order: ``self.strip_tags``, lower-casing, unicode
     transliteration, replacement of URLs, emails, phone numbers, numbers
     and currency symbols with an empty string, and finally punctuation
     removal.

     Args:
         raw_text: The text to clean.

     Returns:
         The cleaned text.
     """
     raw_text = self.strip_tags(raw_text)
     raw_text = raw_text.lower()
     raw_text = preprocess.transliterate_unicode(raw_text)
     raw_text = preprocess.replace_urls(raw_text, replace_with='')
     raw_text = preprocess.replace_emails(raw_text, replace_with='')
     raw_text = preprocess.replace_phone_numbers(raw_text, replace_with='')
     raw_text = preprocess.replace_numbers(raw_text, replace_with='')
     raw_text = preprocess.replace_currency_symbols(raw_text,
                                                    replace_with='')
     # Punctuation is removed last: URLs, emails and phone numbers are
     # recognised by characters such as '://', '.', '@' and '-', so removing
     # punctuation first would leave the replacers above nothing to match.
     raw_text = preprocess.remove_punct(raw_text)
     return raw_text
Ejemplo n.º 7
0
 def test_replace_currency_symbols(self):
     """Check currency symbols become codes by default or a custom marker.

     ``replace_with=None`` is expected to yield USD/GBP/EUR codes, while
     ``replace_with='*CUR* '`` is expected to substitute that marker for
     every symbol.
     """
     replace = preprocess.replace_currency_symbols
     raw = '$1.00 equals £0.67 equals €0.91.'

     with_codes = replace(raw, replace_with=None)
     self.assertEqual(with_codes, 'USD1.00 equals GBP0.67 equals EUR0.91.')

     with_marker = replace(raw, replace_with='*CUR* ')
     self.assertEqual(
         with_marker,
         '*CUR* 1.00 equals *CUR* 0.67 equals *CUR* 0.91.')