def clean_word2vec(text):
    """Normalize a tweet for training/using word2vec embeddings.

    Pipeline: replace @USER/URL placeholders with generic tokens, mask
    multi-digit numbers with '#' runs, demojize emojis (dropping skin-tone
    modifiers), segment hashtags, decode HTML entities, lowercase, expand
    contractions/slang, strip punctuation, and drop a small stopword list.

    Args:
        text: (str) Raw tweet text (mentions pre-anonymized as '@USER',
            links as 'URL' by the upstream dataset).

    Returns:
        (str) Cleaned text as whitespace-joined tokens.
    """
    # Replace mentions/usernames (@USER) with a generic token
    text = re.sub(r"\s*@USER\s*", ' user ', text)

    # Replace the URL placeholder with a generic token
    text = re.sub(r"\s*URL\s*", ' url ', text)

    # Mask numbers; longest runs first so e.g. '12345' -> '#####' rather
    # than being split. Single digits are intentionally left untouched.
    text = re.sub(r'[0-9]{5,}', '#####', text)
    text = re.sub(r'[0-9]{4}', '####', text)
    text = re.sub(r'[0-9]{3}', '###', text)
    text = re.sub(r'[0-9]{2}', '##', text)

    # Convert emojis to their textual names
    text = emoji_extra.demojize(text)

    # Remove skin tone modifiers; compound tones first so e.g.
    # ' medium-dark skin tone' is not partially matched by ' dark skin tone'
    for tone in (" medium-dark skin tone", " medium-light skin tone",
                 " medium skin tone", " dark skin tone", " light skin tone"):
        text = text.replace(tone, "")

    # Segment hashtags into their component words
    # Source: https://pypi.org/project/wordsegment/
    text = re.sub(r"#(\w+)", segment_hashtag_word2vec, text)

    # Decode HTML entities; named entities first so the bare '&' fallback
    # does not corrupt them. Fixed: '&gt' was missing its ';' and left a
    # stray ';' behind for every '&gt;' occurrence.
    text = re.sub("&amp;", "and", text)
    text = re.sub("&lt;", "<", text)
    text = re.sub("&gt;", ">", text)
    text = re.sub("&", "and", text)

    # Lowercase
    text = text.lower()

    # Normalize typographic apostrophes so contraction expansion matches
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")

    # Expand contractions and slang
    text = re.sub("i'm", "i am", text)
    text = re.sub("i've", "i have", text)
    text = contractions.fix(text, slang=True)

    # Strip punctuation (anything that is neither word char nor whitespace)
    text = re.sub(r'[^\w\s]', ' ', text)

    # Tokenize and remove a small, embedding-specific stopword list
    stopwords = ['to', 'and', 'a', 'of']
    tokenized_text = social_tokenizer(text)
    filtered_stopwords = [w for w in tokenized_text if w not in stopwords]

    text = " ".join(filtered_stopwords)

    return text
# Example 2
def clean_fasttext(text):
    """Normalize a tweet for training/using fastText embeddings.

    Pipeline: replace @USER/URL placeholders with generic tokens, demojize
    emojis (dropping skin-tone modifiers), segment hashtags, mark all-caps
    words, lowercase, expand contractions/slang, and tokenize. Numbers are
    left as-is (fastText subword units handle them).

    Args:
        text: (str) Raw tweet text (mentions pre-anonymized as '@USER',
            links as 'URL' by the upstream dataset).

    Returns:
        (str) Cleaned text as whitespace-joined tokens.
    """
    # Replace mentions/usernames (@USER) with a generic token
    text = re.sub(r"\s*@USER\s*", ' user ', text)

    # Replace the URL placeholder with a generic token
    text = re.sub(r"\s*URL\s*", ' url ', text)

    # Convert emojis to their textual names
    text = emoji_extra.demojize(text)

    # Remove skin tone modifiers; compound tones first so e.g.
    # ' medium-dark skin tone' is not partially matched by ' dark skin tone'
    for tone in (" medium-dark skin tone", " medium-light skin tone",
                 " medium skin tone", " dark skin tone", " light skin tone"):
        text = text.replace(tone, "")

    # Segment hashtags into their component words
    # Source: https://pypi.org/project/wordsegment/
    text = re.sub(r"#(\w+)", segment_hashtag_fasttext, text)

    # Mark all-caps words with an 'allcaps' token, lowercasing only the
    # matched run. Fixed: the previous replacement string
    # (text.lower() + " allcaps ") spliced the ENTIRE lowercased text into
    # the output for every all-caps match.
    text = re.sub(r"[A-Z]{2,}", lambda m: m.group(0).lower() + " allcaps ",
                  text)
    text = text.lower()

    # Normalize typographic apostrophes so contraction expansion matches
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")

    # Expand contractions and slang
    text = re.sub("i'm", "i am", text)
    text = re.sub("i've", "i have", text)
    text = contractions.fix(text, slang=True)

    # Tokenize and re-join with single spaces
    text = " ".join(social_tokenizer(text))

    return text
def clean_text(text,
               remove_hashtags=False,
               remove_emojis=False,
               remove_punt_number_special_chars=False,
               remove_stopwords=False,
               apply_stemming=False):
    """Clean text
    Args:
        text: (str) Text
        remove_hashtags: (bool) Remove hashtags together with their words;
            when False only the '#' symbol is stripped and the word is kept
        remove_emojis: (bool) Remove emojis entirely; when False they are
            demojized to their textual names (skin tones dropped)
        remove_punt_number_special_chars: (bool) Remove punctuations, numbers and special characters
        remove_stopwords: (bool) Remove stopwords
        apply_stemming: (bool) Apply stemming on the words on the text
    Returns:
        (str) Cleaned, tokenized text re-joined with single spaces
    """
    # Emojis: either drop them entirely or keep their textual names
    if (remove_emojis):
        # demojize once, then strip the resulting :name: tokens
        text = emoji.demojize(text)
        text = re.sub(r":[a-zA-Z\-\_]*:", "", text)  #:hear-no-evil_monkey:
        text = re.sub(r":\w+:", "", text)
        text = re.sub(r":\w+\’\w+:", "", text)  #:woman's_boot:
    else:
        text = emoji_extra.demojize(text)

        # Remove skin tone modifiers; compound tones first so e.g.
        # ' medium-dark skin tone' is not matched by ' dark skin tone'
        text = re.sub(" medium-dark skin tone", "", text)
        text = re.sub(" medium-light skin tone", "", text)
        text = re.sub(" medium skin tone", "", text)
        text = re.sub(" dark skin tone", "", text)
        text = re.sub(" light skin tone", "", text)

    # Remove mentions, usernames (@USER)
    text = re.sub(r"\s*@USER\s*", '', text)

    # Remove URL placeholder
    text = re.sub(r"\s*URL\s*", '', text)

    # Decode HTML entities; named entities first so the bare '&' fallback
    # does not corrupt them. Fixed: '&gt' was missing its ';'.
    text = re.sub("&amp;", "and", text)
    text = re.sub("&lt;", "<", text)
    text = re.sub("&gt;", ">", text)
    text = re.sub("&", "and", text)

    # Replace contractions and slang of word
    text = re.sub("i'm", "I'm", text)
    text = contractions.fix(text, slang=True)

    # Lowercase
    text = text.lower()

    # Hashtags. Fixed: both branches previously removed hashtag + word;
    # the else branch now keeps the word and strips only the '#' symbol.
    if (remove_hashtags):
        text = re.sub(r"#\s*\w+\s*", '', text)
    else:
        text = re.sub(r"#", '', text)

    # Collapse repeating whitespace. Fixed: the pattern was '\s[2, ]'
    # (whitespace followed by a literal '2', ',' or space) instead of the
    # intended '\s{2,}'.
    text = re.sub(r"\s{2,}", " ", text)

    # Remove non-ascii characters. Fixed: the result was discarded.
    text = text.encode("ascii", errors="ignore").decode()

    # Remove punctuations, numbers and special characters (remove emoticons)
    if remove_punt_number_special_chars:
        text = re.sub('[^a-zA-Z]', ' ', text)

    # Tokenize text
    tt = TweetTokenizer(preserve_case=False,
                        strip_handles=True,
                        reduce_len=True)

    text_tokens = tt.tokenize(text)

    # Remove stopwords
    if remove_stopwords:
        stopwords = set(STOPWORDS)
        text_tokens = [
            token for token in text_tokens if token not in stopwords
        ]

    # Stemming. Fixed: the stemmed list was computed but never used.
    if apply_stemming:
        text_tokens = [stemmer.stem(token) for token in text_tokens]

    clean = " ".join(text_tokens)

    return clean
def clean_glove(text):
    """Normalize a tweet following the GloVe Twitter preprocessing scheme.

    Pipeline: replace @USER/URL with <user>/<url>, numbers with <number>,
    emoticons with <smile>/<lolface>/<sadface>/<neutralface>, demojize
    emojis (dropping skin tones), segment hashtags, mark punctuation
    repetition (<repeat>), elongated words (<elong>) and all-caps words
    (<allcaps>), lowercase, expand contractions/slang, and tokenize.

    Args:
        text: (str) Raw tweet text (mentions pre-anonymized as '@USER',
            links as 'URL' by the upstream dataset).

    Returns:
        (str) Cleaned text as whitespace-joined tokens.
    """
    # Replace mentions/usernames (@USER) with the GloVe <user> token
    text = re.sub(r"\s*@USER\s*", '<user>', text)

    # Replace the URL placeholder with the GloVe <url> token
    text = re.sub(r"\s*URL\s*", '<url>', text)

    # Replace numbers (with optional sign/decimal/thousand separators)
    text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>", text)

    # Emoticons: eyes + optional nose + mouth (and mirrored variants)
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    text = re.sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes),
                  "<smile>", text)
    text = re.sub(r"{}{}p+".format(eyes, nose), "<lolface>", text)
    text = re.sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes),
                  "<sadface>", text)
    text = re.sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>", text)

    # Convert emojis to their textual names
    text = emoji_extra.demojize(text)

    # Remove skin tone modifiers; compound tones first so e.g.
    # ' medium-dark skin tone' is not matched by ' dark skin tone'
    for tone in (" medium-dark skin tone", " medium-light skin tone",
                 " medium skin tone", " dark skin tone", " light skin tone"):
        text = text.replace(tone, "")

    # Segment hashtags into their component words
    # Source: https://pypi.org/project/wordsegment/
    text = re.sub(r"#(\w+)", segment_hashtag_glove, text)

    # Mark punctuation repetitions (eg. "!!!" => "! <repeat>")
    text = re.sub(r"([!?.]){2,}", r"\1 <repeat>", text)

    # Mark elongated words (eg. "wayyyy" => "way <elong>")
    text = re.sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>", text)

    # Mark all-caps words, lowercasing only the matched run. Fixed: the
    # previous replacement string (text.lower() + " <allcaps>") spliced the
    # ENTIRE lowercased text into the output for every all-caps match.
    text = re.sub(r"[A-Z]{2,}", lambda m: m.group(0).lower() + " <allcaps>",
                  text)
    text = text.lower()

    # Normalize typographic apostrophes so contraction expansion matches
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")

    # Expand contractions and slang
    text = re.sub("i'm", "i am", text)
    text = re.sub("i've", "i have", text)
    text = contractions.fix(text, slang=True)

    # Tokenize and re-join with single spaces
    text = " ".join(social_tokenizer(text))

    return text