Example #1
0
def get_words(sentence: str) -> set:
    """Run *sentence* through a fixed ReTexto pipeline and return its words.

    The steps are applied in the order listed below; the value returned is
    whatever the final ``split_words()`` call produces.
    """
    pipeline = ReTexto(sentence)
    # Parentheses keep the fluent chain on separate lines without relying
    # on backslash continuations.
    return (
        pipeline.remove_html()
        .remove_mentions()
        .remove_tags()
        .remove_smiles(by='SMILING')
        .convert_specials()
        .convert_emoji()
        .remove_nochars(preserve_tilde=True)
        .remove_url()
        .remove_punctuation(by=' ')
        .remove_multispaces()
        .lower()
        .remove_stopwords()
        .split_words()
    )
Example #2
0
    def test_pipe(self):
        # Smoke test: the full cleaning chain must run on SAMPLE end to end
        # without raising.
        words = (
            ReTexto(SAMPLE)
            .remove_html()
            .remove_mentions()
            .remove_tags()
            .remove_smiles(by='SMILING')
            .convert_specials()
            .convert_emoji()
            .remove_nochars(preserve_tilde=True)
            .remove_url()
            .remove_duplicate(r='a-jp-z')
            .remove_duplicate_vowels()
            .remove_duplicate_consonants()
            .remove_punctuation()
            .remove_multispaces()
            .lower()
            .split_words(uniques=True)
        )

        # NOTE(review): isinstance(x, object) holds for every Python value,
        # so this only guards against exceptions raised by the chain above.
        assert_true(isinstance(words, object))
Example #3
0
 def test_remove_stopwords(self):
     # Smoke test: remove_stopwords() must run on SAMPLE without raising.
     # NOTE(review): isinstance(x, object) is true for any value, so the
     # assertion itself cannot fail.
     outcome = ReTexto(SAMPLE).remove_stopwords()
     assert_true(isinstance(outcome, object))
Example #4
0
 def test_remove_multispaces(self):
     # Smoke test: remove_multispaces() must run on SAMPLE without raising.
     # NOTE(review): isinstance(x, object) is true for any value, so the
     # assertion itself cannot fail.
     outcome = ReTexto(SAMPLE).remove_multispaces()
     assert_true(isinstance(outcome, object))
Example #5
0
 def test_remove_punctuation(self):
     # Smoke test: remove_punctuation() with its default arguments must run
     # on SAMPLE without raising.
     # NOTE(review): isinstance(x, object) is true for any value, so the
     # assertion itself cannot fail.
     outcome = ReTexto(SAMPLE).remove_punctuation()
     assert_true(isinstance(outcome, object))
Example #6
0
 def test_remove_duplicate_consonants(self):
     # Smoke test: remove_duplicate_consonants() must run on SAMPLE without
     # raising.
     # NOTE(review): isinstance(x, object) is true for any value, so the
     # assertion itself cannot fail.
     outcome = ReTexto(SAMPLE).remove_duplicate_consonants()
     assert_true(isinstance(outcome, object))
Example #7
0
 def test_remove_nochars_2(self):
     # Smoke test: remove_nochars(preserve_tilde=True) must accept input
     # containing accented and 'Ñ' characters without raising.
     # NOTE(review): isinstance(x, object) is true for any value, so the
     # assertion itself cannot fail.
     outcome = ReTexto('perÚ - AÑo').remove_nochars(preserve_tilde=True)
     assert_true(isinstance(outcome, object))
Example #8
0
 def test_strip_acents(self):
     # strip_accents() is expected to leave 'peru' in the .text attribute
     # when given the accented input 'perú'.
     stripped = ReTexto('perú').strip_accents()
     assert_true(stripped.text == 'peru')
Example #9
0
 def test_convert_emoji(self):
     # Smoke test: convert_emoji() must run on SAMPLE without raising.
     # NOTE(review): isinstance(x, object) is true for any value, so the
     # assertion itself cannot fail.
     outcome = ReTexto(SAMPLE).convert_emoji()
     assert_true(isinstance(outcome, object))
Example #10
0
 def test_convert_specials(self):
     # Smoke test: convert_specials() must run on SAMPLE without raising.
     # NOTE(review): isinstance(x, object) is true for any value, so the
     # assertion itself cannot fail.
     outcome = ReTexto(SAMPLE).convert_specials()
     assert_true(isinstance(outcome, object))
Example #11
0
 def test_remove_html(self):
     # Smoke test: remove_html() must run on SAMPLE without raising.
     # NOTE(review): isinstance(x, object) is true for any value, so the
     # assertion itself cannot fail.
     outcome = ReTexto(SAMPLE).remove_html()
     assert_true(isinstance(outcome, object))
Example #12
0
 def test_split_words(self):
     # Smoke test: split_words() with its default arguments must run on
     # SAMPLE without raising.
     # NOTE(review): isinstance(x, object) is true for any value, so the
     # assertion itself cannot fail.
     outcome = ReTexto(SAMPLE).split_words()
     assert_true(isinstance(outcome, object))
Example #13
0
 def test_lower(self):
     # Smoke test: lower() must run on SAMPLE without raising.
     # NOTE(review): isinstance(x, object) is true for any value, so the
     # assertion itself cannot fail.
     outcome = ReTexto(SAMPLE).lower()
     assert_true(isinstance(outcome, object))
Example #14
0
# -*- coding: UTF-8 -*-
from retexto import ReTexto

if __name__ == '__main__':
    # Demo script: push a noisy sample string through a ReTexto cleaning
    # chain and print what comes out.
    #
    # The sample mixes @mentions, URLs, HTML tags, accented characters,
    # emoji, #hashtags, repeated letters and laughter tokens.
    # NOTE: a backslash at the end of a line inside a string literal joins
    # the lines, so the leading indentation of each continued line becomes
    # part of the string value.
    s = '@Edux87, i need this www.google.com | https://github.com <br> \
        <strong>UserName: çarlos </strong> \
        i\'m from Perú 😛 \
        FeLiZ aÑo NuEVo \
        #Friends #Text jajajajaja so fffunny  \
        loooveee thiiis 😌😎 \
        @florenciaflor19 Si!!! sé vo… 🐷JUANA🐷 \
        smiles! hahaha jejeje jojojo jujuju jijijijajaja 😂'

    # Wrap the raw string, then apply the steps in the order written;
    # `s` is rebound to whatever the final split_words() call returns.
    text = ReTexto(s)
    s = text.remove_html() \
            .lower() \
            .remove_mentions() \
            .remove_tags() \
            .remove_smiles(by='smiling') \
            .remove_url() \
            .remove_duplicate(r='a-km-qs-y') \
            .remove_duplicate_vowels() \
            .remove_duplicate_consonants() \
            .remove_multispaces() \
            .remove_punctuation(by=' ') \
            .convert_emoji() \
            .remove_nochars(preserve_tilde=True) \
            .remove_stopwords() \
            .split_words()
    # Show the cleaned result on stdout.
    print(s)
    s = 'San Juan de Lurigancho ¿Por qué es una mala idea destruir un complejo \