def clean_word2vec(text):
    """Clean a text for the word2vec preprocessing variant.

    The text is normalized step by step: placeholders for mentions and
    links become plain words, digit runs are masked, emojis are converted
    by ``emoji_extra.demojize``, hashtags are rewritten by
    ``segment_hashtag_word2vec``, contractions are expanded, punctuation
    is replaced by spaces and a short stopword list is dropped.

    Args:
        text: (str) Text to clean.

    Returns:
        (str) The remaining tokens joined by single spaces.
    """
    # The literal placeholders @USER and URL become the words "user"/"url".
    text = re.sub(r"\s*@USER\s*", " user ", text)
    text = re.sub(r"\s*URL\s*", " url ", text)

    # Numbers: mask runs of two or more digits with '#' (longest first so a
    # long run is not split into shorter masks).
    # NOTE(review): '#' is not matched by \w or \s, so the punctuation pass
    # further down turns these masks into spaces, and a mask directly
    # followed by word characters ("12abc" -> "##abc") is picked up by the
    # hashtag regex. Confirm whether the masks are meant to survive.
    number_masks = (
        (r"[0-9]{5,}", "#####"),
        (r"[0-9]{4}", "####"),
        (r"[0-9]{3}", "###"),
        (r"[0-9]{2}", "##"),
    )
    for pattern, mask in number_masks:
        text = re.sub(pattern, mask, text)

    # Emojis. Presumably demojize yields textual emoji descriptions (the
    # skin-tone phrases removed below suggest so) -- confirm in emoji_extra.
    text = emoji_extra.demojize(text)

    # Remove skin tones
    skin_tones = (
        " medium-dark skin tone",
        " medium-light skin tone",
        " medium skin tone",
        " dark skin tone",
        " light skin tone",
    )
    for tone in skin_tones:
        text = text.replace(tone, "")

    # Hashtag
    # Source: https://pypi.org/project/wordsegment/
    text = re.sub(r"#(\w+)", segment_hashtag_word2vec, text)

    # HTML entities: the previous code replaced "<" with "<" and ">" with
    # ">" (no-ops), so escaped text such as "&amp;" ended up as "andamp;".
    # Decode the escaped forms first, then spell out any bare ampersand.
    text = text.replace("&amp;", "and")
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    text = text.replace("&", "and")

    # Lowercase
    text = text.lower()

    # Replace contractions and slang of word. Typographic quotes are mapped
    # to a plain apostrophe first so the replacements below can match.
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = text.replace("i'm", "i am")
    text = text.replace("i've", "i have")
    text = contractions.fix(text, slang=True)

    # Punctuation: everything that is neither a word character nor
    # whitespace becomes a space.
    text = re.sub(r"[^\w\s]", " ", text)

    # Tokenize and drop the stopwords.
    stopwords = {"to", "and", "a", "of"}
    tokens = social_tokenizer(text)
    return " ".join(w for w in tokens if w not in stopwords)
def clean_fasttext(text):
    """Clean a text for the fastText preprocessing variant.

    Placeholders for mentions and links become plain words, emojis are
    converted by ``emoji_extra.demojize``, hashtags are rewritten by
    ``segment_hashtag_fasttext``, uppercase runs are lowercased and
    followed by the marker word ``allcaps``, contractions are expanded and
    the result is re-tokenized with ``social_tokenizer``.

    Args:
        text: (str) Text to clean.

    Returns:
        (str) The tokens joined by single spaces.
    """
    # The literal placeholders @USER and URL become the words "user"/"url".
    text = re.sub(r"\s*@USER\s*", " user ", text)
    text = re.sub(r"\s*URL\s*", " url ", text)

    # Emojis. Presumably demojize yields textual emoji descriptions (the
    # skin-tone phrases removed below suggest so) -- confirm in emoji_extra.
    text = emoji_extra.demojize(text)

    # Remove skin tones
    skin_tones = (
        " medium-dark skin tone",
        " medium-light skin tone",
        " medium skin tone",
        " dark skin tone",
        " light skin tone",
    )
    for tone in skin_tones:
        text = text.replace(tone, "")

    # Hashtag
    # Source: https://pypi.org/project/wordsegment/
    text = re.sub(r"#(\w+)", segment_hashtag_fasttext, text)

    # Mark runs of two or more capitals. The replacement must be computed
    # per match: passing `text.lower() + " allcaps "` as the replacement
    # string pasted the entire lowercased text over every uppercase run
    # (and let backslashes in the text be parsed as a regex template).
    text = re.sub(
        r"([A-Z]){2,}",
        lambda match: match.group(0).lower() + " allcaps ",
        text,
    )

    # Lowercase
    text = text.lower()

    # Replace contractions and slang of word. Typographic quotes are mapped
    # to a plain apostrophe first so the replacements below can match.
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = text.replace("i'm", "i am")
    text = text.replace("i've", "i have")
    text = contractions.fix(text, slang=True)

    return " ".join(social_tokenizer(text))
def clean_text(text, remove_hashtags=False, remove_emojis=False,
               remove_punt_number_special_chars=False,
               remove_stopwords=False, apply_stemming=False):
    """Clean text

    Args:
        text: (str) Text
        remove_hashtags: (bool) Remove hashtags together with their word;
            when False hashtags are left in the text
        remove_emojis: (bool) Remove emojis; when False they are converted
            with ``emoji_extra.demojize`` instead
        remove_punt_number_special_chars: (bool) Remove punctuations,
            numbers and special characters
        remove_stopwords: (bool) Remove stopwords
        apply_stemming: (bool) Apply stemming on the words on the text

    Returns:
        (str) The cleaned tokens joined by single spaces.
    """
    # Emojis: either strip the ":name:" codes produced by emoji.demojize or
    # keep the conversion done by emoji_extra.demojize.
    if remove_emojis:
        text = emoji.demojize(text)
        text = re.sub(r":[a-zA-Z\-\_]*:", "", text)  # :hear-no-evil_monkey:
        text = re.sub(r":\w+:", "", text)
        text = re.sub(r":\w+\’\w+:", "", text)  # :woman's_boot:
    else:
        text = emoji_extra.demojize(text)

    # Remove skin tones
    skin_tones = (
        " medium-dark skin tone",
        " medium-light skin tone",
        " medium skin tone",
        " dark skin tone",
        " light skin tone",
    )
    for tone in skin_tones:
        text = text.replace(tone, "")

    # Remove mentions (@USER) and URL placeholders. They are replaced by a
    # space, not by nothing: the patterns also consume the surrounding
    # whitespace, so an empty replacement would glue the neighbours
    # together ("hi @USER there" -> "hithere").
    text = re.sub(r"\s*@USER\s*", " ", text)
    text = re.sub(r"\s*URL\s*", " ", text)

    # HTML entities: the previous code replaced "<" with "<" and ">" with
    # ">" (no-ops), so escaped text such as "&amp;" ended up as "andamp;".
    # Decode the escaped forms first, then spell out any bare ampersand.
    text = text.replace("&amp;", "and")
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    text = text.replace("&", "and")

    # Replace contractions and slang of word
    text = text.replace("i'm", "I'm")
    text = contractions.fix(text, slang=True)

    # Lowercase
    text = text.lower()

    # Remove Hashtags + Words, only when requested (both branches used to
    # remove them, so the flag had no effect).
    if remove_hashtags:
        text = re.sub(r"#\s*\w+\s*", "", text)

    # Remove repeating whitespaces (the old pattern "\s[2, ]" matched a
    # whitespace followed by '2', ',' or ' ' and so deleted real content).
    text = re.sub(r"\s{2,}", " ", text)

    # Remove non ascii characters (the result used to be discarded).
    text = text.encode("ascii", errors="ignore").decode()

    # Remove punctuations, numbers and special characters (remove emoticons)
    if remove_punt_number_special_chars:
        text = re.sub(r"[^a-zA-Z]", " ", text)

    # Tokenize text
    tt = TweetTokenizer(preserve_case=False, strip_handles=True,
                        reduce_len=True)
    text_tokens = tt.tokenize(text)

    # Remove stopwords
    if remove_stopwords:
        stopwords = set(STOPWORDS)
        text_tokens = [token for token in text_tokens
                       if token not in stopwords]

    # Stemming: the stems must replace the tokens, otherwise the flag has
    # no effect on the returned text.
    if apply_stemming:
        text_tokens = [stemmer.stem(token) for token in text_tokens]

    return " ".join(text_tokens)
def clean_glove(text):
    """Clean a text for the GloVe preprocessing variant.

    Mentions, links, numbers and emoticons are replaced by angle-bracket
    tags (``<user>``, ``<url>``, ``<number>``, ``<smile>``, ...), emojis
    are converted by ``emoji_extra.demojize``, hashtags are rewritten by
    ``segment_hashtag_glove``, repeated punctuation, elongated words and
    uppercase runs are marked with ``<repeat>``, ``<elong>`` and
    ``<allcaps>``, contractions are expanded and the result is
    re-tokenized with ``social_tokenizer``.

    Args:
        text: (str) Text to clean.

    Returns:
        (str) The tokens joined by single spaces.
    """
    # Mentions and links. The patterns consume the surrounding whitespace,
    # so the tags are padded with spaces to keep them from fusing with the
    # neighbouring words ("hi @USER there" -> "hi<user>there").
    text = re.sub(r"\s*@USER\s*", " <user> ", text)
    text = re.sub(r"\s*URL\s*", " <url> ", text)

    # Numbers
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>", text)

    # Emoticons, built from an "eyes" class and an optional "nose".
    eyes = r"[8:=;]"
    nose = r"['`\-]?"
    text = re.sub(
        r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes),
        "<smile>", text)
    text = re.sub(r"{}{}p+".format(eyes, nose), "<lolface>", text)
    text = re.sub(
        r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes),
        "<sadface>", text)
    text = re.sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>", text)

    # Emojis. Presumably demojize yields textual emoji descriptions (the
    # skin-tone phrases removed below suggest so) -- confirm in emoji_extra.
    text = emoji_extra.demojize(text)

    # Remove skin tones
    skin_tones = (
        " medium-dark skin tone",
        " medium-light skin tone",
        " medium skin tone",
        " dark skin tone",
        " light skin tone",
    )
    for tone in skin_tones:
        text = text.replace(tone, "")

    # Hashtag
    # Source: https://pypi.org/project/wordsegment/
    text = re.sub(r"#(\w+)", segment_hashtag_glove, text)

    # Repeat
    # Mark punctuation repetitions (eg. "!!!" => "! <REPEAT>")
    text = re.sub(r"([!?.]){2,}", r"\1 <repeat>", text)

    # Elong
    # Mark elongated words (eg. "wayyyy" => "way <ELONG>")
    text = re.sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>", text)

    # Mark runs of two or more capitals. The replacement must be computed
    # per match: passing `text.lower() + " <allcaps>"` as the replacement
    # string pasted the entire lowercased text over every uppercase run
    # (and let backslashes in the text be parsed as a regex template).
    text = re.sub(
        r"([A-Z]){2,}",
        lambda match: match.group(0).lower() + " <allcaps>",
        text,
    )

    # Lowercase
    text = text.lower()

    # Replace contractions and slang of word. Typographic quotes are mapped
    # to a plain apostrophe first so the replacements below can match.
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = text.replace("i'm", "i am")
    text = text.replace("i've", "i have")
    text = contractions.fix(text, slang=True)

    return " ".join(social_tokenizer(text))