import re
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import strip_accents_ascii

def normalize(text):
    # Accept UTF-8 byte strings as well as str
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Remove URLs (scheme://...); note [a-zA-Z], not the buggy [a-zA-z]
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)
    # Remove IPv4 addresses
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    # Transliterate accented characters to their ASCII equivalents
    text = strip_accents_ascii(text)
    # Tokenize, lowercase, and rejoin with single spaces
    return ' '.join(token.lower() for token in TreebankWordTokenizer().tokenize(text))
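A quick sanity check of the expected behaviour (the sample sentence is just an illustration):

print(normalize("Visit http://example.com from 192.168.0.1, café!"))
# -> 'visit from , cafe !'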
def clean_up(text):
    # Remove mentions of other users
    text = re.sub(r'@\S*', '', text)
    # Remove URLs
    text = re.sub(r'http\S*', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove the tokens 'user' and 'RT'
    text = re.sub('user', '', text)
    text = re.sub('RT', '', text)
    # Remove special characters and emojis
    text = re.sub(r'[^\w\s]', '', text)
    text = text.encode('ascii', 'ignore').decode('ascii')
    return text
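For example, on a hypothetical tweet-like string the function strips the mention, URL, digits, and punctuation; note that it does not collapse the leftover whitespace:

print(clean_up("RT @user1: check http://t.co/xyz 123!!"))
# -> '  check  '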
Example #3
import re

def normalize(text):
    # Accept UTF-8 byte strings as well as str
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # Remove URLs (scheme://...); note [a-zA-Z], not the buggy [a-zA-z]
    text = re.sub(r'[a-zA-Z]+://[^\s]*', '', text)
    # Remove IPv4 addresses
    text = re.sub(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', '', text)
    return text
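As above, a small usage sketch (the input string is hypothetical):

print(normalize(b"ping 10.0.0.1 via https://example.org now"))
# -> 'ping  via  now'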