def get_words(sentence: str) -> set:
    """Run ``sentence`` through the ReTexto cleaning chain and split it into words.

    The chain is applied in this order: remove_html, remove_mentions,
    remove_tags, remove_smiles(by='SMILING'), convert_specials,
    convert_emoji, remove_nochars(preserve_tilde=True), remove_url,
    remove_punctuation(by=' '), remove_multispaces, lower,
    remove_stopwords and finally split_words.

    Args:
        sentence: Raw text to clean.

    Returns:
        Whatever ``split_words()`` yields for the cleaned text.
        NOTE(review): the signature is annotated ``set``; confirm against
        ``ReTexto.split_words`` that this is the real return type.
    """
    cleaned = (
        ReTexto(sentence)
        .remove_html()
        .remove_mentions()
        .remove_tags()
        .remove_smiles(by='SMILING')
        .convert_specials()
        .convert_emoji()
        .remove_nochars(preserve_tilde=True)
        .remove_url()
        .remove_punctuation(by=' ')
        .remove_multispaces()
        .lower()
        .remove_stopwords()
    )
    return cleaned.split_words()
def test_pipe(self):
    """Smoke test: the full chained cleaning pipeline runs on SAMPLE."""
    pipeline = (
        ReTexto(SAMPLE)
        .remove_html()
        .remove_mentions()
        .remove_tags()
        .remove_smiles(by='SMILING')
        .convert_specials()
        .convert_emoji()
        .remove_nochars(preserve_tilde=True)
        .remove_url()
        .remove_duplicate(r='a-jp-z')
        .remove_duplicate_vowels()
        .remove_duplicate_consonants()
        .remove_punctuation()
        .remove_multispaces()
        .lower()
    )
    words = pipeline.split_words(uniques=True)
    # NOTE(review): isinstance(x, object) is True for every Python value, so
    # this assertion can only fail if one of the calls above raises.
    assert_true(isinstance(words, object))
def test_remove_stopwords(self):
    """Smoke test: ``remove_stopwords`` can be called on SAMPLE."""
    outcome = ReTexto(SAMPLE).remove_stopwords()
    # NOTE(review): isinstance(x, object) is always True; this only checks
    # that the call above does not raise.
    assert_true(isinstance(outcome, object))
def test_remove_multispaces(self):
    """Smoke test: ``remove_multispaces`` can be called on SAMPLE."""
    outcome = ReTexto(SAMPLE).remove_multispaces()
    # NOTE(review): isinstance(x, object) is always True; this only checks
    # that the call above does not raise.
    assert_true(isinstance(outcome, object))
def test_remove_punctuation(self):
    """Smoke test: ``remove_punctuation`` (default arguments) runs on SAMPLE."""
    outcome = ReTexto(SAMPLE).remove_punctuation()
    # NOTE(review): isinstance(x, object) is always True; this only checks
    # that the call above does not raise.
    assert_true(isinstance(outcome, object))
def test_remove_duplicate_consonants(self):
    """Smoke test: ``remove_duplicate_consonants`` can be called on SAMPLE."""
    outcome = ReTexto(SAMPLE).remove_duplicate_consonants()
    # NOTE(review): isinstance(x, object) is always True; this only checks
    # that the call above does not raise.
    assert_true(isinstance(outcome, object))
def test_remove_nochars_2(self):
    """Smoke test: ``remove_nochars(preserve_tilde=True)`` on accented input."""
    accented = ReTexto('perÚ - AÑo')
    outcome = accented.remove_nochars(preserve_tilde=True)
    # NOTE(review): isinstance(x, object) is always True, so the resulting
    # text is not actually verified here; only "does not raise" is checked.
    assert_true(isinstance(outcome, object))
def test_strip_acents(self):
    """``strip_accents`` on 'perú' must leave ``.text`` equal to 'peru'."""
    stripped = ReTexto('perú').strip_accents()
    matches = stripped.text == 'peru'
    assert_true(matches)
def test_convert_emoji(self):
    """Smoke test: ``convert_emoji`` can be called on SAMPLE."""
    outcome = ReTexto(SAMPLE).convert_emoji()
    # NOTE(review): isinstance(x, object) is always True; this only checks
    # that the call above does not raise.
    assert_true(isinstance(outcome, object))
def test_convert_specials(self):
    """Smoke test: ``convert_specials`` can be called on SAMPLE."""
    outcome = ReTexto(SAMPLE).convert_specials()
    # NOTE(review): isinstance(x, object) is always True; this only checks
    # that the call above does not raise.
    assert_true(isinstance(outcome, object))
def test_remove_html(self):
    """Smoke test: ``remove_html`` can be called on SAMPLE."""
    outcome = ReTexto(SAMPLE).remove_html()
    # NOTE(review): isinstance(x, object) is always True; this only checks
    # that the call above does not raise.
    assert_true(isinstance(outcome, object))
def test_split_words(self):
    """Smoke test: ``split_words`` (default arguments) runs on SAMPLE."""
    words = ReTexto(SAMPLE).split_words()
    # NOTE(review): isinstance(x, object) is always True; this only checks
    # that the call above does not raise.
    assert_true(isinstance(words, object))
def test_lower(self):
    """Smoke test: ``lower`` can be called on SAMPLE."""
    outcome = ReTexto(SAMPLE).lower()
    # NOTE(review): isinstance(x, object) is always True; this only checks
    # that the call above does not raise.
    assert_true(isinstance(outcome, object))
# -*- coding: UTF-8 -*- from retexto import ReTexto if __name__ == '__main__': s = '@Edux87, i need this www.google.com | https://github.com <br> \ <strong>UserName: çarlos </strong> \ i\'m from Perú 😛 \ FeLiZ aÑo NuEVo \ #Friends #Text jajajajaja so fffunny \ loooveee thiiis 😌😎 \ @florenciaflor19 Si!!! sé vo… 🐷JUANA🐷 \ smiles! hahaha jejeje jojojo jujuju jijijijajaja 😂' text = ReTexto(s) s = text.remove_html() \ .lower() \ .remove_mentions() \ .remove_tags() \ .remove_smiles(by='smiling') \ .remove_url() \ .remove_duplicate(r='a-km-qs-y') \ .remove_duplicate_vowels() \ .remove_duplicate_consonants() \ .remove_multispaces() \ .remove_punctuation(by=' ') \ .convert_emoji() \ .remove_nochars(preserve_tilde=True) \ .remove_stopwords() \ .split_words() print(s) s = 'San Juan de Lurigancho ¿Por qué es una mala idea destruir un complejo \